diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 000000000..740acdeff --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,2 @@ +# See go/codeowners - automatically generated for confluentinc/kafka-connect-bigquery: +* @confluentinc/connect diff --git a/.gitignore b/.gitignore index 64577b615..ad8c22082 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,10 @@ bin/ build/ +target/ *.class *.jar *.tar +*.zip .gradle @@ -22,6 +24,6 @@ build/ key.json test.conf -kcbq-connector/src/integration-test/resources/test.properties +kcbq-connector/src/test/resources/test.properties kcbq-connector/test/docker/connect/properties/ kcbq-connector/out/ diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index f1b4fbf59..000000000 --- a/.travis.yml +++ /dev/null @@ -1,32 +0,0 @@ -language: java -sudo: true -dist: trusty -group: edge - -jdk: - - oraclejdk8 - - openjdk8 - - openjdk11 - -matrix: - fast_finish: true - -script: - - ./gradlew test - -after_success: - - if [ -e ./gradlew ]; then ./gradlew jacocoTestReport; else gradle jacocoTestReport; fi - - bash <(curl -s https://codecov.io/bash) - -before_cache: - - rm -f $HOME/.gradle/caches/modules-2/modules-2.lock - - rm -fr $HOME/.gradle/caches/*/plugin-resolution/ - -cache: - directories: - - $HOME/.gradle/caches/ - - $HOME/.gradle/wrapper/ - -notifications: - email: - - open-source@wepay.com diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 000000000..8771cc3e4 --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,30 @@ +#!/usr/bin/env groovy +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +common { + slackChannel = '#connect-warn' + nodeLabel = 'docker-oraclejdk8' + publish = false + downStreamValidate = false + secret_file_list = [ + ['gcp/kcbq', 'creds', '/tmp/creds.json', 'KCBQ_TEST_KEYFILE'], + ['gcp/kcbq', 'creds', '/tmp/creds.json', 'GOOGLE_APPLICATION_CREDENTIALS'] + ] + timeoutHours = 2 +} diff --git a/README.md b/README.md index 6bbfc4a9a..fe7d0917b 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,14 @@ This is an implementation of a sink connector from [Apache Kafka] to [Google BigQuery], built on top of [Apache Kafka Connect]. For a comprehensive list of configuration options, see the [Connector Configuration Wiki]. +## History + +This connector was [originally developed by WePay](https://github.com/wepay/kafka-connect-bigquery). +In late 2020 the project moved to [Confluent](https://github.com/confluentinc/kafka-connect-bigquery), +with both companies taking on maintenance duties. All new activity such as filing issues and opening +pull requests should now target the [Confluent](https://github.com/confluentinc/kafka-connect-bigquery) +fork of the project. + ## Download The latest releases are available in the GitHub release tab, or via [tarballs in Maven central](http://search.maven.org/#search%7Cga%7C1%7Ca%3A%22kcbq-connector%22). 
@@ -40,8 +48,19 @@ save the properties file. Once you get more familiar with the connector, you might want to revisit the `connector.properties` file and experiment with tweaking its settings. - -### Building and Extracting a Tarball + +#### Migrating to 2.x.x +In accordance with the introduction of schema unionization in version 2.0.0, the following changes +to configs have been introduced and should be made when migrating: +1. `autoUpdateSchemas` has been removed +2. `allowNewBigQueryFields` and `allowBigQueryRequiredFieldRelaxation` have been introduced +3. `allowSchemaUnionization` has been introduced + +Setting `allowNewBigQueryFields` and `allowBigQueryRequiredFieldRelaxation` to `true` while +setting `allowSchemaUnionization` to false results in the same behavior that setting `autoUpdateSchemas` +to `true` used to. + +### Building and Extracting a Confluent Hub archive If you haven't already, move into the repository's top-level directory: @@ -49,16 +68,16 @@ If you haven't already, move into the repository's top-level directory: $ cd /path/to/kafka-connect-bigquery/ ``` -Begin by creating a tarball of the connector with the Confluent Schema Retriever included: +Begin by creating Confluent Hub archive of the connector with the Confluent Schema Retriever included: ```bash -$ ./gradlew clean distTar +$ mvn clean package -DskipTests ``` And then extract its contents: ```bash -$ mkdir -p bin/jar/ && tar -C bin/jar/ -xf kcbq-confluent/build/distributions/kcbq-confluent-*.tar +$ mkdir -p bin/jar/ && cp kcbq-connector/target/components/packages/wepay-kafka-connect-bigquery-*/wepay-kafka-connect-bigquery-*/lib/*.jar bin/jar/ ``` ### Setting-Up Background Processes @@ -131,22 +150,54 @@ adjusting flags given to the Avro Console Producer and tweaking the config setti ## Integration Testing the Connector -> **NOTE**: You must have [Docker] installed and running on your machine in order to run integration -tests for the connector. +### Configuring the tests + +You must supply the following environment variables in order to run the tests: + +- `$KCBQ_TEST_PROJECT`: The name of the BigQuery project to use for the test +- `$KCBQ_TEST_DATASET`: The name of the BigQuery dataset to use for the test +- `$KCBQ_TEST_KEYFILE`: The key file used to authenticate with BigQuery during the test +- `$KCBQ_TEST_BUCKET`: The name of the GCS bucket to use (for testing the GCS batch loading feature) + +The `$KCBQ_TEST_FOLDER` variable can be supplied to specify which subfolder of the GCS bucket should +be used when testing the GCS batch loading feature; if not supplied, the top-level folder will be +used. + +### Adding new GCP Credentials & BigQuery DataSet +This section is optional in case one wants to use a different GCP project and generate new creds for that +- **Create a GCP Service Account:** Follow instructions from https://cloud.google.com/iam/docs/creating-managing-service-accounts e.g. +``` +gcloud iam service-accounts create kcbq-test --description="service account key for bigquery sink integration test" --display-name="kcbq-test" +``` +- **Create Service Account Keys:** Follow instructions from https://cloud.google.com/iam/docs/creating-managing-service-account-keys e.g. +``` +gcloud iam service-accounts keys create /tmp/creds.json --iam-account=kcbq-test@.iam.gserviceaccount.com +``` +- **Give BigQuery & Storage Admin Permissions to Service Account:** + - Open https://console.cloud.google.com/iam-admin/iam?project= + - Click on Add and enter New Principal as created above e.g. 
`kcbq-test@.iam.gserviceaccount.com` + - Add following 2 roles from "Select a role" drop down menu: + - BigQuery -> BigQuery Admin + - Cloud Storage -> Storage Admin +- **Add a BigQuery DataSet into the Project:** + - Open https://console.cloud.google.com/bigquery?project= + - Click on the 3 vertical dots against the project name and click on "Create dataset" and follow the steps there. + +### Running the Integration Tests -This all takes place in the `kcbq-connector` directory. +```bash +# (Re)builds the project and runs the integration tests, skipping unit tests to save a bit of time +mvn clean package integration-test -Dskip.unit.tests=true +``` ### How Integration Testing Works -Integration tests run by creating [Docker] instances for [Zookeeper], [Kafka], [Schema Registry], +Integration tests run by creating embedded instances for [Zookeeper], [Kafka], [Schema Registry], and the BigQuery Connector itself, then verifying the results using a [JUnit] test. -They use schemas and data that can be found in the `test/docker/populate/test_schemas/` directory, -and rely on a user-provided JSON key file (like in the `quickstart` example) to access BigQuery. - -The project and dataset they write to, as well as the specific JSON key file they use, can be -specified by command-line flag, environment variable, or configuration file — the exact details of -each can be found by running the integration test script with the `-?` flag. +They use schemas and data that can be found in the +`kcbq-connector/src/test/resources/integration_test_cases/` directory, and rely on a user-provided +JSON key file (like in the `quickstart` example) to access BigQuery. ### Data Corruption Concerns @@ -156,70 +207,49 @@ if you have any tables in your dataset whose names begin with `kcbq_test_` and m name of any of the `test_schema` subdirectories. If that is the case, you should probably consider writing to a different project/dataset. -Because Kafka and Schema Registry are run in Docker, there is no risk that running integration -tests will corrupt any existing data that is already on your machine, and there is also no need to -free up any of your ports that might currently be in use by real instances of the programs that are -faked in the process of testing. - -### Running the Integration Tests - -Running the series of integration tests is easy: - -```bash -$ test/integrationtest.sh -``` - -This assumes that the project, dataset, and key file have been specified by variable or -configuration file. For more information on how to specify these, run the test script with -the `--help` flag. - -> **NOTE:** You must have a recent version of [boot2docker], [Docker Machine], [Docker], etc. -installed. Older versions will hang when cleaning containers, and linking doesn't work properly. +Kafka, Schema Registry, Zookeeper, and Kafka Connect are all run as temporary embedded instances, so +there is no risk that running integration tests will corrupt any existing data that is already on +your machine, and there is also no need to free up any of your ports that might currently be in use +by instances of the services that are brought up in the process of testing. ### Adding New Integration Tests Adding an integration test is a little more involved, and consists of two major steps: specifying -Avro data to be sent to Kafka, and specifying via JUnit test how to verify that such data made +Avro data to be sent to Kafka, and specifying via JUnit test how to verify that such data made it to BigQuery as expected. 
-To specify input data, you must create a new directory in the `test/resources/test_schemas/` -directory with whatever name you want the Kafka topic of your test to be named, and whatever -string you want the name of your test's BigQuery table to be derived from. Then, create two files -in that directory: +To specify input data, you must create a new directory in the +`kcbq-connector/src/test/resources/integration_test_cases/` directory with whatever name you want +the Kafka topic of your test to be named, and whatever string you want the name of your test's +BigQuery table to be derived from. Then, create two files in that directory: * `schema.json` will contain the Avro schema of the type of data the new test will send through the connector. -* `data.json` will contain a series of JSON objects, each of which should represent an [Avro] record -that matches the specified schema. **Each JSON object must occupy its own line, and each object -cannot occupy more than one line** (this inconvenience is due to limitations in the Avro +* `data.json` will contain a series of JSON objects, each of which should represent an [Avro] record +that matches the specified schema. **Each JSON object must occupy its own line, and each object +cannot occupy more than one line** (this inconvenience is due to limitations in the Avro Console Producer, and may be addressed in future commits). -To specify data verification, add a new JUnit test to the file -`src/integration-test/java/com/wepay/kafka/connect/bigquery/it/BigQueryConnectorIntegrationTest.java`. -Rows that are retrieved from BigQuery in the test are only returned as _Lists_ of _Objects_. The -names of their columns are not tracked. Construct a _List_ of the _Objects_ that you expect to be -stored in the test's BigQuery table, retrieve the actual _List_ of _Objects_ stored via a call to -`readAllRows()`, and then compare the two via a call to `testRows()`. +To specify data verification, add to the test cases present in the +`kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/BigQuerySinkConnectorIT.java` > **NOTE**: Because the order of rows is not guaranteed when reading test results from BigQuery, -you must include a row number as the first field of any of your test schemas, and every row of test -data must have a unique value for its row number (row numbers are one-indexed). +you must include a numeric column named "row" number in all of your test schemas, and every row of +test data must have a unique value for its row number. When data is read back from BigQuery to +verify its accuracy, it will be returned in ascending order based on that "row" column. 
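For illustration, here is a minimal sketch of what a new test case might look like on disk. The directory name (`test_example`) and the `f1` field are placeholders invented for this example; only the directory layout, the two file names, and the required `row` column come from the description above.

```bash
# Hypothetical test case: "test_example" and "f1" are made-up names for this sketch;
# the required pieces are schema.json, data.json, and a unique numeric "row" column.
CASE_DIR=kcbq-connector/src/test/resources/integration_test_cases/test_example
mkdir -p "$CASE_DIR"

# schema.json: the Avro schema of the records the test will send through the connector
cat > "$CASE_DIR/schema.json" <<'EOF'
{
  "type": "record",
  "name": "TestExample",
  "fields": [
    {"name": "row", "type": "long"},
    {"name": "f1", "type": "string"}
  ]
}
EOF

# data.json: one Avro record per line, each with a unique value for "row"
cat > "$CASE_DIR/data.json" <<'EOF'
{"row": 1, "f1": "first value"}
{"row": 2, "f1": "second value"}
EOF
```

The matching expected rows would then be added to the verification logic in `BigQuerySinkConnectorIT.java`, as described above.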
[Apache Avro]: https://avro.apache.org - [Apache Kafka Connect]: http://docs.confluent.io/3.0.0/connect/ + [Apache Kafka Connect]: http://docs.confluent.io/current/connect/ [Apache Kafka]: http://kafka.apache.org [Apache Maven]: https://maven.apache.org [Avro]: https://avro.apache.org [BigQuery]: https://cloud.google.com/bigquery/ - [boot2docker]: http://boot2docker.io - [Confluent Platform]: http://docs.confluent.io/3.0.0/installation.html + [Confluent Platform]: http://docs.confluent.io/current/installation.html [Connector Configuration Wiki]: https://github.com/wepay/kafka-connect-bigquery/wiki/Connector-Configuration - [Docker Machine]: https://docs.docker.com/machine/ - [Docker]: https://www.docker.com [Google BigQuery]: https://cloud.google.com/bigquery/ [JUnit]: http://junit.org - [Kafka Connect]: http://docs.confluent.io/3.0.0/connect/ + [Kafka Connect]: http://docs.confluent.io/current/connect/ [Kafka]: http://kafka.apache.org [Maven]: https://maven.apache.org [Schema Registry]: https://github.com/confluentinc/schema-registry diff --git a/build.gradle b/build.gradle deleted file mode 100644 index 1652f5af4..000000000 --- a/build.gradle +++ /dev/null @@ -1,369 +0,0 @@ -plugins { - id "com.github.spotbugs" version "2.0.0" -} - -// BEGIN ALL PROJECTS // -allprojects { - apply plugin: 'java' - - sourceCompatibility = JavaVersion.VERSION_1_8 - targetCompatibility = JavaVersion.VERSION_1_8 -} - -def withoutKafka = { - exclude group: 'org.apache.kafka', module: 'connect-api' - exclude group: 'org.apache.kafka', module: 'connect-transforms' - exclude group: 'org.apache.kafka', module: 'kafka-clients' -} - -// END ALL PROJECTS - -project.ext { - apacheHttpClientVersion = '4.5.6' - avroVersion = '1.8.1' - debeziumVersion = '0.6.1' - googleCloudVersion = '1.79.0' - googleAuthVersion = '0.9.0' - googleCloudGsonVersion = '2.8.5' - ioConfluentVersion = '5.5.0' - junitVersion = '4.12' - kafkaVersion = '2.5.0' - mockitoVersion = '3.2.4' - slf4jVersion = '1.6.1' -} - -// BEGIN SUBPROJECTS // -subprojects { subproject -> - - apply plugin: 'maven' - apply plugin: 'signing' - apply plugin: 'checkstyle' - apply plugin: 'idea' - apply plugin: 'eclipse' - - jar.baseName = subproject.name - - [compileJava, compileTestJava].each { - it.options.compilerArgs << '-Xlint:unchecked' - } - - checkstyle { - configFile = file("${rootDir}/config/checkstyle/google_checks.xml") - toolVersion = '6.18' - } - - spotbugsMain { - reports { - xml.enabled = false - html.enabled = true - } - } - - task javadocJar(type: Jar) { - classifier = 'javadoc' - from javadoc - } - - task sourcesJar(type: Jar) { - classifier = 'sources' - from sourceSets.main.allSource - } - - signing { - sign configurations.archives - required { - gradle.taskGraph.hasTask('uploadArchives') - } - } - - uploadArchives { - repositories { - mavenDeployer { - beforeDeployment { - MavenDeployment deployment -> signing.signPom(deployment) - } - - repository(url: 'https://oss.sonatype.org/service/local/staging/deploy/maven2') { - authentication(userName: findProperty('ossrhUsername') ?: '', password: findProperty('ossrhPassword') ?: '') - } - - snapshotRepository(url: 'https://oss.sonatype.org/content/repositories/snapshots') { - authentication(userName: findProperty('ossrhUsername') ?: '', password: findProperty('ossrhPassword') ?: '') - } - - pom.project { - - licenses { - license { - name 'The Apache License, Version 2.0' - url 'http://www.apache.org/licenses/LICENSE-2.0.txt' - } - } - - scm { - connection 
'scm:git:git://github.com/wepay/kafka-connect-bigquery.git' - developerConnection 'scm:git:ssh://github.com:wepay/kafka-connect-bigquery.git' - url 'https://github.com/wepay/kafka-connect-bigquery' - } - - developers { - developer { - id 'C0urante' - name 'Chris Egerton' - email 'fearthecellos@gmail.comw' - } - - developer { - id 'moirat' - name 'Moira Tagle' - email 'moirat@wepay.com' - } - } - } - } - } - } -} -// END SUBPROJECTS - -// BEGIN INDIVIDUAL PROJECTS -project(':kcbq-connector') { - apply plugin: 'jacoco' - - jar { - manifest { - attributes 'Implementation-Title': 'Kafka Connect BigQuery Connector', - 'Implementation-Version': version - - } - } - - repositories { - mavenCentral() - } - - sourceSets { - integrationTest { - java { - compileClasspath += main.output - runtimeClasspath += main.output - srcDir file('src/integration-test/java') - } - resources.srcDir file('src/integration-test/resources') - } - } - - task integrationTestPrep() { - dependsOn 'integrationTestTablePrep' - dependsOn 'integrationTestBucketPrep' - } - - task integrationTestTablePrep(type: JavaExec) { - main = 'com.wepay.kafka.connect.bigquery.it.utils.TableClearer' - classpath = sourceSets.integrationTest.runtimeClasspath - args findProperty('kcbq_test_keyfile') ?: '' - args findProperty('kcbq_test_project') ?: '' - args findProperty('kcbq_test_dataset') ?: '' - args findProperty('kcbq_test_keysource') ?: '' - if (findProperty('kcbq_test_tables') != null) - args findProperty('kcbq_test_tables').split(' ') - } - - task integrationTestBucketPrep(type: JavaExec) { - main = 'com.wepay.kafka.connect.bigquery.it.utils.BucketClearer' - classpath = sourceSets.integrationTest.runtimeClasspath - args findProperty('kcbq_test_keyfile') ?: '' - args findProperty('kcbq_test_project') ?: '' - args findProperty('kcbq_test_bucket') ?: '' - args findProperty('kcbq_test_keysource') ?: '' - } - - task integrationTest(type: Test) { - testClassesDirs = sourceSets.integrationTest.output.classesDirs - classpath = sourceSets.integrationTest.runtimeClasspath - } - - compileIntegrationTestJava.options.compilerArgs << '-Xlint:unchecked' - - configurations { - integrationTestCompile.extendsFrom testCompile - integrationTestRuntime.extendsFrom testRuntime - } - - javadoc { - options.links 'http://docs.oracle.com/javase/8/docs/api/' - options.links 'http://docs.confluent.io/3.2.0/connect/javadocs/' - options.links 'https://googleapis.dev/java/google-cloud-clients/0.97.0-alpha/' - options.links 'https://kafka.apache.org/0100/javadoc/' - options.links 'https://avro.apache.org/docs/1.8.1/api/java/' - } - - jacocoTestReport { - reports { - html.destination file("${buildDir}/reports/jacoco/") - xml.enabled true - } - } - - dependencies { - compile ( - project(':kcbq-api'), - - "com.google.cloud:google-cloud-bigquery:$googleCloudVersion", - "com.google.cloud:google-cloud-storage:$googleCloudVersion", - "com.google.auth:google-auth-library-oauth2-http:$googleAuthVersion", - "com.google.code.gson:gson:$googleCloudGsonVersion", - "org.slf4j:slf4j-api:$slf4jVersion", - ) - - compile "io.debezium:debezium-core:$debeziumVersion", withoutKafka - - compileOnly ( - "org.apache.kafka:connect-api:$kafkaVersion" - ) - - testCompile ( - "junit:junit:$junitVersion", - "org.mockito:mockito-core:$mockitoVersion", - "org.mockito:mockito-inline:$mockitoVersion", - "org.apache.kafka:connect-api:$kafkaVersion" - ) - } - - artifacts { - archives javadocJar, sourcesJar - } - - uploadArchives { - repositories { - mavenDeployer { - pom.project { - name 'Kafka Connect 
BigQuery Connector' - packaging 'jar' - description 'A Kafka Connector used to load data into BigQuery' - url 'https://github.com/wepay/kafka-connect-bigquery' - } - } - } - } -} - -project('kcbq-api') { - jar { - manifest { - attributes 'Implementation-Title': 'Kafka Connect BigQuery API', - 'Implementation-Version': version - } - } - - repositories { - mavenCentral() - } - - javadoc { - options.links 'http://docs.oracle.com/javase/8/docs/api/' - options.links 'http://docs.confluent.io/3.2.0/connect/javadocs/' - } - - dependencies { - compile "com.google.cloud:google-cloud-bigquery:$googleCloudVersion" - - compileOnly "org.apache.kafka:connect-api:$kafkaVersion" - } - - artifacts { - archives javadocJar, sourcesJar - } - - uploadArchives { - repositories { - mavenDeployer { - pom.project { - name 'Kafka Connect BigQuery Connector API' - packaging 'jar' - description 'A small API for the Kafka Connector used to load data into BigQuery' - url 'https://github.com/wepay/kafka-connect-bigquery' - } - } - } - } -} - -project('kcbq-confluent') { - apply plugin: 'distribution' - - distributions { - main { - baseName = 'kcbq-confluent' - contents { - from configurations.runtime, jar - } - } - } - - jar { - manifest { - attributes 'Implementation-Title': 'Kafka Connect BigQuery Schema Registry Schema Retriever', - 'Implementation-Version': version - } - } - - repositories { - mavenCentral() - maven { - url 'http://packages.confluent.io/maven' - } - jcenter() - } - - javadoc { - options.links 'http://docs.oracle.com/javase/8/docs/api/' - options.links 'http://docs.confluent.io/3.2.0/connect/javadocs/' - } - - dependencies { - - compile ( - project(':kcbq-connector'), - project(':kcbq-api'), - - "org.apache.avro:avro:$avroVersion", - "org.slf4j:slf4j-api:$slf4jVersion", - ) - - compile "io.confluent:kafka-connect-avro-converter:$ioConfluentVersion", withoutKafka - compile "io.confluent:kafka-schema-registry-client:$ioConfluentVersion", withoutKafka - - compileOnly ( - "org.apache.kafka:connect-api:$kafkaVersion", - "org.apache.kafka:kafka-clients:$kafkaVersion" - ) - - testCompile ( - "junit:junit:$junitVersion", - "org.mockito:mockito-core:$mockitoVersion", - "org.mockito:mockito-inline:$mockitoVersion", - "org.apache.kafka:connect-api:$kafkaVersion", - "org.apache.kafka:kafka-clients:$kafkaVersion" - - ) - } - - artifacts { - archives javadocJar, sourcesJar, distTar - } - - uploadArchives { - repositories { - mavenDeployer { - pom.project { - name 'Kafka Connect BigQuery Connector Schema Registry Schema Retriever' - packaging 'jar' - description 'A Schema Registry-based schema retriever for the Kafka Connector used to load data into BigQuery' - url 'https://github.com/wepay/kafka-connect-bigquery' - } - } - } - } -} -// END INDIVIDUAL PROJECTS diff --git a/codecov.yml b/codecov.yml deleted file mode 100644 index c644d5794..000000000 --- a/codecov.yml +++ /dev/null @@ -1,43 +0,0 @@ -codecov: - branch: master - bot: skyzyx - -coverage: - precision: 2 - round: down - range: "70...100" - - status: - project: - default: - target: auto - threshold: 1.25 - branches: - - master - - feature/* - - patch: - default: - target: auto - branches: - - master - - feature/* - - changes: - default: - branches: - - master - - feature/* - - ignore: - - config/.* - - gradle/.* - - test/.* - - .*/vendor/.* - -comment: - layout: "header, diff, changes, sunburst, uncovered, tree" - behavior: default - branches: - - master - - feature/* diff --git a/config/checkstyle/suppressions.xml b/config/checkstyle/suppressions.xml new 
file mode 100644 index 000000000..f7f6089d5 --- /dev/null +++ b/config/checkstyle/suppressions.xml @@ -0,0 +1,27 @@ + + + + + + \ No newline at end of file diff --git a/config/copyright/custom-header-styles.xml b/config/copyright/custom-header-styles.xml new file mode 100644 index 000000000..4b296d70b --- /dev/null +++ b/config/copyright/custom-header-styles.xml @@ -0,0 +1,44 @@ + + + + + /* + * + */EOL + (\s|\t)*/\*.*$ + .*\*/(\s|\t)*$ + false + true + false + + + /* + * + */ + #!.* + (\s|\t)*/\*.* + .*\*/(\s|\t)*$ + false + true + false + + \ No newline at end of file diff --git a/gradle.properties b/gradle.properties deleted file mode 100644 index 7e259101b..000000000 --- a/gradle.properties +++ /dev/null @@ -1,2 +0,0 @@ -group=com.wepay.kcbq -version=1.6.5 diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar deleted file mode 100644 index 94336fcae..000000000 Binary files a/gradle/wrapper/gradle-wrapper.jar and /dev/null differ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties deleted file mode 100644 index b0acbdcd7..000000000 --- a/gradle/wrapper/gradle-wrapper.properties +++ /dev/null @@ -1,5 +0,0 @@ -distributionBase=GRADLE_USER_HOME -distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-5.5-bin.zip -zipStoreBase=GRADLE_USER_HOME -zipStorePath=wrapper/dists diff --git a/gradlew b/gradlew deleted file mode 100755 index cccdd3d51..000000000 --- a/gradlew +++ /dev/null @@ -1,172 +0,0 @@ -#!/usr/bin/env sh - -############################################################################## -## -## Gradle start up script for UN*X -## -############################################################################## - -# Attempt to set APP_HOME -# Resolve links: $0 may be a link -PRG="$0" -# Need this for relative symlinks. -while [ -h "$PRG" ] ; do - ls=`ls -ld "$PRG"` - link=`expr "$ls" : '.*-> \(.*\)$'` - if expr "$link" : '/.*' > /dev/null; then - PRG="$link" - else - PRG=`dirname "$PRG"`"/$link" - fi -done -SAVED="`pwd`" -cd "`dirname \"$PRG\"`/" >/dev/null -APP_HOME="`pwd -P`" -cd "$SAVED" >/dev/null - -APP_NAME="Gradle" -APP_BASE_NAME=`basename "$0"` - -# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -DEFAULT_JVM_OPTS="" - -# Use the maximum available, or set MAX_FD != -1 to use that value. -MAX_FD="maximum" - -warn () { - echo "$*" -} - -die () { - echo - echo "$*" - echo - exit 1 -} - -# OS specific support (must be 'true' or 'false'). -cygwin=false -msys=false -darwin=false -nonstop=false -case "`uname`" in - CYGWIN* ) - cygwin=true - ;; - Darwin* ) - darwin=true - ;; - MINGW* ) - msys=true - ;; - NONSTOP* ) - nonstop=true - ;; -esac - -CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar - -# Determine the Java command to use to start the JVM. -if [ -n "$JAVA_HOME" ] ; then - if [ -x "$JAVA_HOME/jre/sh/java" ] ; then - # IBM's JDK on AIX uses strange locations for the executables - JAVACMD="$JAVA_HOME/jre/sh/java" - else - JAVACMD="$JAVA_HOME/bin/java" - fi - if [ ! -x "$JAVACMD" ] ; then - die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME - -Please set the JAVA_HOME variable in your environment to match the -location of your Java installation." - fi -else - JAVACMD="java" - which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 
- -Please set the JAVA_HOME variable in your environment to match the -location of your Java installation." -fi - -# Increase the maximum file descriptors if we can. -if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then - MAX_FD_LIMIT=`ulimit -H -n` - if [ $? -eq 0 ] ; then - if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then - MAX_FD="$MAX_FD_LIMIT" - fi - ulimit -n $MAX_FD - if [ $? -ne 0 ] ; then - warn "Could not set maximum file descriptor limit: $MAX_FD" - fi - else - warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" - fi -fi - -# For Darwin, add options to specify how the application appears in the dock -if $darwin; then - GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" -fi - -# For Cygwin, switch paths to Windows format before running java -if $cygwin ; then - APP_HOME=`cygpath --path --mixed "$APP_HOME"` - CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` - JAVACMD=`cygpath --unix "$JAVACMD"` - - # We build the pattern for arguments to be converted via cygpath - ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` - SEP="" - for dir in $ROOTDIRSRAW ; do - ROOTDIRS="$ROOTDIRS$SEP$dir" - SEP="|" - done - OURCYGPATTERN="(^($ROOTDIRS))" - # Add a user-defined pattern to the cygpath arguments - if [ "$GRADLE_CYGPATTERN" != "" ] ; then - OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" - fi - # Now convert the arguments - kludge to limit ourselves to /bin/sh - i=0 - for arg in "$@" ; do - CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` - CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option - - if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition - eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` - else - eval `echo args$i`="\"$arg\"" - fi - i=$((i+1)) - done - case $i in - (0) set -- ;; - (1) set -- "$args0" ;; - (2) set -- "$args0" "$args1" ;; - (3) set -- "$args0" "$args1" "$args2" ;; - (4) set -- "$args0" "$args1" "$args2" "$args3" ;; - (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; - (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; - (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; - (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; - (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; - esac -fi - -# Escape application args -save () { - for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done - echo " " -} -APP_ARGS=$(save "$@") - -# Collect all arguments for the java command, following the shell quoting and substitution rules -eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" - -# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong -if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then - cd "$(dirname "$0")" -fi - -exec "$JAVACMD" "$@" diff --git a/gradlew.bat b/gradlew.bat deleted file mode 100644 index e95643d6a..000000000 --- a/gradlew.bat +++ /dev/null @@ -1,84 +0,0 @@ -@if "%DEBUG%" == "" @echo off -@rem ########################################################################## -@rem -@rem Gradle startup script for Windows -@rem -@rem ########################################################################## - -@rem Set local scope for the variables with windows NT shell -if 
"%OS%"=="Windows_NT" setlocal - -set DIRNAME=%~dp0 -if "%DIRNAME%" == "" set DIRNAME=. -set APP_BASE_NAME=%~n0 -set APP_HOME=%DIRNAME% - -@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -set DEFAULT_JVM_OPTS= - -@rem Find java.exe -if defined JAVA_HOME goto findJavaFromJavaHome - -set JAVA_EXE=java.exe -%JAVA_EXE% -version >NUL 2>&1 -if "%ERRORLEVEL%" == "0" goto init - -echo. -echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. -echo. -echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. - -goto fail - -:findJavaFromJavaHome -set JAVA_HOME=%JAVA_HOME:"=% -set JAVA_EXE=%JAVA_HOME%/bin/java.exe - -if exist "%JAVA_EXE%" goto init - -echo. -echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% -echo. -echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. - -goto fail - -:init -@rem Get command-line arguments, handling Windows variants - -if not "%OS%" == "Windows_NT" goto win9xME_args - -:win9xME_args -@rem Slurp the command line arguments. -set CMD_LINE_ARGS= -set _SKIP=2 - -:win9xME_args_slurp -if "x%~1" == "x" goto execute - -set CMD_LINE_ARGS=%* - -:execute -@rem Setup the command line - -set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar - -@rem Execute Gradle -"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% - -:end -@rem End local scope for the variables with windows NT shell -if "%ERRORLEVEL%"=="0" goto mainEnd - -:fail -rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of -rem the _cmd.exe /c_ return code! -if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 -exit /b 1 - -:mainEnd -if "%OS%"=="Windows_NT" endlocal - -:omega diff --git a/kcbq-api/pom.xml b/kcbq-api/pom.xml new file mode 100644 index 000000000..1b46a5291 --- /dev/null +++ b/kcbq-api/pom.xml @@ -0,0 +1,63 @@ + + + + 4.0.0 + + + com.wepay.kcbq + kcbq-parent + 2.5.0-SNAPSHOT + .. + + + kcbq-api + kafka-connect-bigquery-api + + + ${project.parent.basedir} + + + + + org.apache.kafka + connect-api + + + + com.google.cloud + google-cloud-bigquery + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + + org.apache.maven.plugins + maven-checkstyle-plugin + + + + diff --git a/kcbq-api/src/main/java/com/wepay/kafka/connect/bigquery/api/KafkaSchemaRecordType.java b/kcbq-api/src/main/java/com/wepay/kafka/connect/bigquery/api/KafkaSchemaRecordType.java index 8b197c416..b7b1b0c0b 100644 --- a/kcbq-api/src/main/java/com/wepay/kafka/connect/bigquery/api/KafkaSchemaRecordType.java +++ b/kcbq-api/src/main/java/com/wepay/kafka/connect/bigquery/api/KafkaSchemaRecordType.java @@ -1,3 +1,22 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package com.wepay.kafka.connect.bigquery.api; diff --git a/kcbq-api/src/main/java/com/wepay/kafka/connect/bigquery/api/SchemaRetriever.java b/kcbq-api/src/main/java/com/wepay/kafka/connect/bigquery/api/SchemaRetriever.java index 704ec5828..cba0c4aa5 100644 --- a/kcbq-api/src/main/java/com/wepay/kafka/connect/bigquery/api/SchemaRetriever.java +++ b/kcbq-api/src/main/java/com/wepay/kafka/connect/bigquery/api/SchemaRetriever.java @@ -1,3 +1,22 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package com.wepay.kafka.connect.bigquery.api; import org.apache.kafka.connect.data.Schema; @@ -30,5 +49,4 @@ public interface SchemaRetriever { * @return The value Schema for the given record. */ Schema retrieveValueSchema(SinkRecord record); - } diff --git a/kcbq-connector/logos/BigQuery.png b/kcbq-connector/logos/BigQuery.png new file mode 100644 index 000000000..a7e0a7156 Binary files /dev/null and b/kcbq-connector/logos/BigQuery.png differ diff --git a/kcbq-connector/logos/confluent.png b/kcbq-connector/logos/confluent.png new file mode 100644 index 000000000..14cd8c506 Binary files /dev/null and b/kcbq-connector/logos/confluent.png differ diff --git a/kcbq-connector/pom.xml b/kcbq-connector/pom.xml new file mode 100644 index 000000000..3d0b5978e --- /dev/null +++ b/kcbq-connector/pom.xml @@ -0,0 +1,214 @@ + + + + 4.0.0 + + + com.wepay.kcbq + kcbq-parent + 2.5.0-SNAPSHOT + .. 
+ + + kcbq-connector + kafka-connect-bigquery + + + ${project.parent.basedir} + + + + + org.apache.kafka + connect-api + + + + com.fasterxml.jackson.core + jackson-core + + + + com.google.cloud + google-cloud-bigquery + + + com.google.cloud + google-cloud-storage + + + com.google.auth + google-auth-library-oauth2-http + + + org.slf4j + slf4j-api + + + io.debezium + debezium-core + + + + com.wepay.kcbq + kcbq-api + + + + junit + junit + + + org.mockito + mockito-core + + + org.slf4j + slf4j-log4j12 + + + org.apache.kafka + kafka_${kafka.scala.version} + + + org.apache.kafka + kafka_${kafka.scala.version} + test + test-jar + + + org.apache.kafka + kafka-clients + test + test-jar + + + org.apache.kafka + connect-runtime + + + org.apache.kafka + connect-runtime + test + test-jar + + + io.confluent + kafka-schema-registry + + + io.confluent + kafka-schema-registry + tests + test-jar + + + io.confluent + kafka-connect-avro-converter + + + io.confluent + kafka-avro-serializer + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + + org.apache.maven.plugins + maven-surefire-plugin + + + org.apache.maven.plugins + maven-failsafe-plugin + + + org.jacoco + jacoco-maven-plugin + + + org.apache.maven.plugins + maven-checkstyle-plugin + + + io.confluent + kafka-connect-maven-plugin + + + + kafka-connect + + + BigQuery Sink Connector + kafka-connect-bigquery + + A sink connector for writing to Google BigQuery, with support for automatic table creation and schema evolution. + + logos/BigQuery.png + https://docs.confluent.io/kafka-connect-bigquery/current/index.html + https://github.com/confluentinc/kafka-connect-bigquery + + Confluent, Inc. + supported by Confluent as part of a Confluent Platform subscription.]]> + https://docs.confluent.io/kafka-connect-bigquery/current/index.html + logos/confluent.png + + wepay + organization + WePay + https://go.wepay.com/ + + true + + + sink + + + + cloud + analytics + data + gcp + google + bigquery + warehouse + platform + nosql + + + + Apache Kafka 0.11 or higher / Confluent Platform 3.3 or higher + Java 1.8 or higher + Active Google Cloud Platform (GCP) account with authorization to create resources + Kafka Connect 0.11 or higher / Confluent Platform 3.3 or higher + + + + + + + + diff --git a/kcbq-connector/quickstart/avro-console-producer.sh b/kcbq-connector/quickstart/avro-console-producer.sh index a7fe02118..9065f0cb3 100755 --- a/kcbq-connector/quickstart/avro-console-producer.sh +++ b/kcbq-connector/quickstart/avro-console-producer.sh @@ -1,5 +1,8 @@ #! /usr/bin/env bash -# Copyright 2016 WePay, Inc. +# +# Copyright 2020 Confluent, Inc. +# +# This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +16,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# BASE_DIR=`dirname "$0"` diff --git a/kcbq-connector/quickstart/connector.sh b/kcbq-connector/quickstart/connector.sh index 5c9dcecd9..123e9bbe9 100755 --- a/kcbq-connector/quickstart/connector.sh +++ b/kcbq-connector/quickstart/connector.sh @@ -1,5 +1,8 @@ #! /usr/bin/env bash -# Copyright 2016 WePay, Inc. +# +# Copyright 2020 Confluent, Inc. +# +# This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +16,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# BASE_DIR="$(cd "$(dirname "$0")" && pwd)" diff --git a/kcbq-connector/quickstart/kafka.sh b/kcbq-connector/quickstart/kafka.sh index 953c0d3f8..2ce3391ab 100755 --- a/kcbq-connector/quickstart/kafka.sh +++ b/kcbq-connector/quickstart/kafka.sh @@ -1,5 +1,8 @@ #! /usr/bin/env bash -# Copyright 2016 WePay, Inc. +# +# Copyright 2020 Confluent, Inc. +# +# This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +16,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# BASE_DIR=`dirname "$0"` diff --git a/kcbq-connector/quickstart/properties/connector.properties b/kcbq-connector/quickstart/properties/connector.properties index ff2fd35f1..3e2aa55ce 100644 --- a/kcbq-connector/quickstart/properties/connector.properties +++ b/kcbq-connector/quickstart/properties/connector.properties @@ -1,4 +1,7 @@ -# Copyright 2016 WePay, Inc. +# +# Copyright 2020 Confluent, Inc. +# +# This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,6 +15,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# name=bigquery-connector connector.class=com.wepay.kafka.connect.bigquery.BigQuerySinkConnector @@ -24,10 +28,6 @@ autoUpdateSchemas=true schemaRetriever=com.wepay.kafka.connect.bigquery.retrieve.IdentitySchemaRetriever -bufferSize=100000 -maxWriteSize=10000 -tableWriteWait=1000 - # An example regex router SMT that strips (kcbq_) from the topic name. # Replace with relevant regex to replace the topic of each sink record with # destination dataset and table name in the format : or only the destination diff --git a/kcbq-connector/quickstart/properties/standalone.properties b/kcbq-connector/quickstart/properties/standalone.properties index 2aee81055..1450e07cc 100644 --- a/kcbq-connector/quickstart/properties/standalone.properties +++ b/kcbq-connector/quickstart/properties/standalone.properties @@ -1,4 +1,7 @@ -# Copyright 2016 WePay, Inc. +# +# Copyright 2020 Confluent, Inc. +# +# This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,6 +15,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# bootstrap.servers=localhost:9092 key.converter=io.confluent.connect.avro.AvroConverter diff --git a/kcbq-connector/quickstart/schema-registry.sh b/kcbq-connector/quickstart/schema-registry.sh index 5b5dfd3a6..61735fabc 100755 --- a/kcbq-connector/quickstart/schema-registry.sh +++ b/kcbq-connector/quickstart/schema-registry.sh @@ -1,5 +1,8 @@ #! /usr/bin/env bash -# Copyright 2016 WePay, Inc. +# +# Copyright 2020 Confluent, Inc. 
+# +# This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +16,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# BASE_DIR=`dirname "$0"` diff --git a/kcbq-connector/quickstart/zookeeper.sh b/kcbq-connector/quickstart/zookeeper.sh index ad5a88205..3e5fcbdcc 100755 --- a/kcbq-connector/quickstart/zookeeper.sh +++ b/kcbq-connector/quickstart/zookeeper.sh @@ -1,5 +1,8 @@ #! /usr/bin/env bash -# Copyright 2016 WePay, Inc. +# +# Copyright 2020 Confluent, Inc. +# +# This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +16,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# BASE_DIR=`dirname "$0"` diff --git a/kcbq-connector/src/integration-test/java/com/wepay/kafka/connect/bigquery/it/BigQueryConnectorIntegrationTest.java b/kcbq-connector/src/integration-test/java/com/wepay/kafka/connect/bigquery/it/BigQueryConnectorIntegrationTest.java deleted file mode 100644 index 46775b418..000000000 --- a/kcbq-connector/src/integration-test/java/com/wepay/kafka/connect/bigquery/it/BigQueryConnectorIntegrationTest.java +++ /dev/null @@ -1,393 +0,0 @@ -package com.wepay.kafka.connect.bigquery.it; - -/* - * Copyright 2016 WePay, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - - -import static com.google.cloud.bigquery.LegacySQLTypeName.BOOLEAN; -import static com.google.cloud.bigquery.LegacySQLTypeName.BYTES; -import static com.google.cloud.bigquery.LegacySQLTypeName.DATE; -import static com.google.cloud.bigquery.LegacySQLTypeName.FLOAT; -import static com.google.cloud.bigquery.LegacySQLTypeName.INTEGER; -import static com.google.cloud.bigquery.LegacySQLTypeName.STRING; -import static com.google.cloud.bigquery.LegacySQLTypeName.TIMESTAMP; - -import static org.junit.Assert.assertEquals; - -import com.google.cloud.bigquery.BigQuery; -import com.google.cloud.bigquery.Field; -import com.google.cloud.bigquery.FieldValue; -import com.google.cloud.bigquery.FieldValueList; -import com.google.cloud.bigquery.Schema; -import com.google.cloud.bigquery.Table; -import com.google.cloud.bigquery.TableResult; - -import com.wepay.kafka.connect.bigquery.BigQueryHelper; -import com.wepay.kafka.connect.bigquery.exception.SinkConfigConnectException; - -import org.junit.BeforeClass; -import org.junit.Test; - -import java.io.FileNotFoundException; -import java.io.InputStream; - -import java.time.LocalDate; -import java.time.ZoneOffset; -import java.time.format.DateTimeFormatter; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; -import java.util.Properties; - -public class BigQueryConnectorIntegrationTest { - public static final String TEST_PROPERTIES_FILENAME = "/test.properties"; - public static final String KEYFILE_PROPERTY = "keyfile"; - public static final String PROJECT_PROPERTY = "project"; - public static final String DATASET_PROPERTY = "dataset"; - public static final String KEY_SOURCE_PROPERTY = "keySource"; - - private static String keyfile; - private static String project; - private static String dataset; - private static String keySource; - - private static BigQuery bigQuery; - - @BeforeClass - public static void initialize() throws Exception { - initializeTestProperties(); - initializeBigQuery(); - } - - private static void initializeTestProperties() throws Exception { - try (InputStream propertiesFile = - BigQueryConnectorIntegrationTest.class.getResourceAsStream(TEST_PROPERTIES_FILENAME)) { - if (propertiesFile == null) { - throw new FileNotFoundException( - "Resource file '" + TEST_PROPERTIES_FILENAME - + "' must be provided in order to run integration tests" - ); - } - - Properties properties = new Properties(); - properties.load(propertiesFile); - - keyfile = properties.getProperty(KEYFILE_PROPERTY); - if (keyfile == null) { - throw new SinkConfigConnectException( - "'" + KEYFILE_PROPERTY - + "' property must be specified in test properties file" - ); - } - - project = properties.getProperty(PROJECT_PROPERTY); - if (project == null) { - throw new SinkConfigConnectException( - "'" + PROJECT_PROPERTY - + "' property must be specified in test properties file" - ); - } - - dataset = properties.getProperty(DATASET_PROPERTY); - if (dataset == null) { - throw new SinkConfigConnectException( - "'" + DATASET_PROPERTY - + "' property must be specified in test properties file" - ); - } - - keySource = properties.getProperty(KEY_SOURCE_PROPERTY); - } - } - - private static void initializeBigQuery() throws Exception { - bigQuery = new BigQueryHelper().setKeySource(keySource).connect(project, keyfile); - } - - private static List boxByteArray(byte[] bytes) { - Byte[] result = new Byte[bytes.length]; - for (int i = 0; i < bytes.length; i++) { - result[i] = bytes[i]; - } - return Arrays.asList(result); - } - - 
private Object convertField(Field fieldSchema, FieldValue field) { - if (field.isNull()) { - return null; - } - switch (field.getAttribute()) { - case PRIMITIVE: - if (fieldSchema.getType().equals(BOOLEAN)) { - return field.getBooleanValue(); - } else if (fieldSchema.getType().equals(BYTES)) { - // Do this in order for assertEquals() to work when this is an element of two compared - // lists - return boxByteArray(field.getBytesValue()); - } else if (fieldSchema.getType().equals(DATE)) { - DateTimeFormatter dateFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd"); - long millisecondsSinceEpoch = LocalDate.parse(field.getStringValue(), dateFormatter) - .atStartOfDay(ZoneOffset.UTC) - .toInstant() - .toEpochMilli(); - return millisecondsSinceEpoch; - } else if (fieldSchema.getType().equals(FLOAT)) { - return field.getDoubleValue(); - } else if (fieldSchema.getType().equals(INTEGER)) { - return field.getLongValue(); - } else if (fieldSchema.getType().equals(STRING)) { - return field.getStringValue(); - } else if (fieldSchema.getType().equals(TIMESTAMP)) { - return field.getTimestampValue(); - } else { - throw new RuntimeException("Cannot convert primitive field type " - + fieldSchema.getType()); - } - case REPEATED: - List result = new ArrayList<>(); - for (FieldValue arrayField : field.getRepeatedValue()) { - result.add(convertField(fieldSchema, arrayField)); - } - return result; - case RECORD: - List recordSchemas = fieldSchema.getSubFields(); - List recordFields = field.getRecordValue(); - return convertRow(recordSchemas, recordFields); - default: - throw new RuntimeException("Unknown field attribute: " + field.getAttribute()); - } - } - - private List convertRow(List rowSchema, List row) { - List result = new ArrayList<>(); - assert (rowSchema.size() == row.size()); - - for (int i = 0; i < rowSchema.size(); i++) { - if (rowSchema.get(i).getName().equals("row")) { - result.add(convertField(rowSchema.get(i), row.get(i))); - } - } - for (int i = 0; i < rowSchema.size(); i++) { - if (!rowSchema.get(i).getName().equals("row")) { - result.add(convertField(rowSchema.get(i), row.get(i))); - } - } - - return result; - } - - private List> readAllRows(String tableName) { - Table table = bigQuery.getTable(dataset, tableName); - Schema schema = table.getDefinition().getSchema(); - - List> rows = new ArrayList<>(); - TableResult tableResult = table.list(); - - while (tableResult != null) { - Iterable fieldValueLists = tableResult.iterateAll(); - for (FieldValueList fieldValueList : fieldValueLists) { - rows.add(convertRow(schema.getFields(), fieldValueList)); - } - tableResult = tableResult.getNextPage(); - } - return rows; - } - - @Test - public void testNull() { - List> expectedRows = new ArrayList<>(); - - // {"row":1,"f1":"Required string","f2":null,"f3":{"int":42},"f4":{"boolean":false}} - expectedRows.add(Arrays.asList(1L, "Required string", null, 42L, false)); - // {"row":2,"f1":"Required string","f2":{"string":"Optional string"},"f3":{"int":89},"f4":null} - expectedRows.add(Arrays.asList(2L, "Required string", "Optional string", 89L, null)); - // {"row":3,"f1":"Required string","f2":null,"f3":null,"f4":{"boolean":true}} - expectedRows.add(Arrays.asList(3L, "Required string", null, null, true)); - // {"row":4,"f1":"Required string","f2":{"string":"Optional string"},"f3":null,"f4":null} - expectedRows.add(Arrays.asList(4L, "Required string", "Optional string", null, null)); - - testRows(expectedRows, readAllRows("test_nulls")); - } - - @Test - public void testMatryoshka() { - List> expectedRows = 
new ArrayList<>(); - - /* { "row": 1, - "middle": - { "middle_array": [42.0, 42.42, 42.4242], - "inner": - { "inner_int": 42, - "inner_string": "42" - } - }, - "inner": - { "inner_int": -42, - "inner_string": "-42" - } - } */ - expectedRows.add(Arrays.asList( - 1L, - Arrays.asList( - Arrays.asList(42.0, 42.42, 42.4242), - Arrays.asList( - 42L, - "42" - ) - ), - Arrays.asList( - -42L, - "-42" - ) - )); - - testRows(expectedRows, readAllRows("test_matryoshka_dolls")); - } - - @Test - public void testPrimitives() { - List> expectedRows = new ArrayList<>(); - - /* { "row": 1, - "null_prim": null, - "boolean_prim": false, - "int_prim": 4242, - "long_prim": 42424242424242, - "float_prim": 42.42, - "double_prim": 42424242.42424242, - "string_prim": "forty-two", - "bytes_prim": "\u0000\u000f\u001e\u002d\u003c\u004b\u005a\u0069\u0078" - } */ - expectedRows.add(Arrays.asList( - 1L, - null, - false, - 4242L, - 42424242424242L, - 42.42, - 42424242.42424242, - "forty-two", - boxByteArray(new byte[] { 0x0, 0xf, 0x1E, 0x2D, 0x3C, 0x4B, 0x5A, 0x69, 0x78 }) - )); - - testRows(expectedRows, readAllRows("test_primitives")); - } - - @Test - public void testLogicalTypes() { - List> expectedRows = new ArrayList<>(); - - // {"row": 1, "timestamp-test": 0, "date-test": 0} - expectedRows.add(Arrays.asList(1L, 0L, 0L)); - // {"row": 2, "timestamp-test": 42000000, "date-test": 4200} - expectedRows.add(Arrays.asList(2L, 42000000000L, 362880000000L)); - // {"row": 3, "timestamp-test": 1468275102000, "date-test": 16993} - expectedRows.add(Arrays.asList(3L, 1468275102000000L, 1468195200000L)); - - testRows(expectedRows, readAllRows("test_logical_types")); - } - - @Test - public void testGCSLoad() { - List> expectedRows = new ArrayList<>(); - - /* {"row":1, - "null_prim":null, - "boolean_prim":false, - "int_prim":4242, - "long_prim":42424242424242, - "float_prim":42.42, - "double_prim":42424242.42424242, - "string_prim":"forty-two", - "bytes_prim":"\u0000\u000f\u001e\u002d\u003c\u004b\u005a\u0069\u0078"} - */ - expectedRows.add(Arrays.asList( - 1L, - null, - false, - 4242L, - 42424242424242L, - 42.42, - 42424242.42424242, - "forty-two", - boxByteArray(new byte[] { 0x0, 0xf, 0x1E, 0x2D, 0x3C, 0x4B, 0x5A, 0x69, 0x78 }) - )); - /* {"row":2, - "null_prim":{"int":5}, - "boolean_prim":true, - "int_prim":4354, - "long_prim":435443544354, - "float_prim":43.54, - "double_prim":435443.544354, - "string_prim":"forty-three", - "bytes_prim":"\u0000\u000f\u001e\u002d\u003c\u004b\u005a\u0069\u0078"} - */ - expectedRows.add(Arrays.asList( - 2L, - 5L, - true, - 4354L, - 435443544354L, - 43.54, - 435443.544354, - "forty-three", - boxByteArray(new byte[] { 0x0, 0xf, 0x1E, 0x2D, 0x3C, 0x4B, 0x5A, 0x69, 0x78 }) - )); - /* {"row":3, - "null_prim":{"int":8}, - "boolean_prim":false, - "int_prim":1993, - "long_prim":199319931993, - "float_prim":19.93, - "double_prim":199319.931993, - "string_prim":"nineteen", - "bytes_prim":"\u0000\u000f\u001e\u002d\u003c\u004b\u005a\u0069\u0078"} - */ - expectedRows.add(Arrays.asList( - 3L, - 8L, - false, - 1993L, - 199319931993L, - 19.93, - 199319.931993, - "nineteen", - boxByteArray(new byte[] { 0x0, 0xf, 0x1E, 0x2D, 0x3C, 0x4B, 0x5A, 0x69, 0x78 }) - )); - - testRows(expectedRows, readAllRows("test_gcs_load")); - } - - private void testRows( - List> expectedRows, - List> testRows) { - assertEquals("Number of expected rows should match", expectedRows.size(), testRows.size()); - - for (List testRow : testRows) { - int rowNumber = (int) (((Long) testRow.get(0)).longValue()); - List expectedRow = 
expectedRows.get(rowNumber - 1); - expectedRow.sort(Comparator.nullsLast(Comparator.comparing(Object::toString))); - testRow.sort(Comparator.nullsLast(Comparator.comparing(Object::toString))); - assertEquals( - "Row " + rowNumber + " (if these look identical, it's probably a type mismatch)", - expectedRow, - testRow - ); - } - } -} diff --git a/kcbq-connector/src/integration-test/java/com/wepay/kafka/connect/bigquery/it/utils/BucketClearer.java b/kcbq-connector/src/integration-test/java/com/wepay/kafka/connect/bigquery/it/utils/BucketClearer.java deleted file mode 100644 index 1c108f35e..000000000 --- a/kcbq-connector/src/integration-test/java/com/wepay/kafka/connect/bigquery/it/utils/BucketClearer.java +++ /dev/null @@ -1,64 +0,0 @@ -package com.wepay.kafka.connect.bigquery.it.utils; - -/* - * Copyright 2016 WePay, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - - -import com.google.cloud.storage.Blob; -import com.google.cloud.storage.Bucket; - -import com.google.cloud.storage.Storage; -import com.wepay.kafka.connect.bigquery.GCSBuilder; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class BucketClearer { - - private static final Logger logger = LoggerFactory.getLogger(BucketClearer.class); - private static String keySource; - - /** - * Clears tables in the given project and dataset, using a provided JSON service account key. - */ - public static void main(String[] args) { - if (args.length < 4) { - usage(); - } - if (args.length == 4) { - keySource = args[3]; - } - Storage gcs = new GCSBuilder(args[1]).setKey(args[0]).setKeySource(keySource).build(); - - String bucketName = args[2]; - Bucket bucket = gcs.get(bucketName); - if (bucket != null) { - logger.info("Deleting objects in the Bucket {}", bucketName); - for (Blob blob : bucket.list().iterateAll()) { - gcs.delete(blob.getBlobId()); - } - } else { - logger.info("Bucket {} does not exist", bucketName); - } - } - - private static void usage() { - System.err.println( - "usage: BucketClearer " - ); - System.exit(1); - } -} diff --git a/kcbq-connector/src/integration-test/java/com/wepay/kafka/connect/bigquery/it/utils/TableClearer.java b/kcbq-connector/src/integration-test/java/com/wepay/kafka/connect/bigquery/it/utils/TableClearer.java deleted file mode 100644 index 55638d4eb..000000000 --- a/kcbq-connector/src/integration-test/java/com/wepay/kafka/connect/bigquery/it/utils/TableClearer.java +++ /dev/null @@ -1,59 +0,0 @@ -package com.wepay.kafka.connect.bigquery.it.utils; - -/* - * Copyright 2016 WePay, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - - -import com.google.api.gax.paging.Page; -import com.google.cloud.bigquery.BigQuery; - -import com.google.cloud.bigquery.Table; -import com.wepay.kafka.connect.bigquery.BigQueryHelper; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class TableClearer { - private static final Logger logger = LoggerFactory.getLogger(TableClearer.class); - private static String keySource; - - - /** - * Clears tables in the given project and dataset, using a provided JSON service account key. - */ - public static void main(String[] args) { - if (args.length < 5) { - usage(); - } - if (args.length == 5) { - keySource = args[3]; - } - BigQuery bigQuery = new BigQueryHelper().setKeySource(keySource).connect(args[1], args[0]); - Page tables = bigQuery.listTables(args[2]); - logger.info("Deleting the tables in {} dataset", args[2]); - for (Table table : tables.iterateAll()) { - table.delete(); - } - } - - private static void usage() { - System.err.println( - "usage: TableClearer
<key file> <project name> <dataset> <key source> [<table>
...]" - ); - System.exit(1); - } -} diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/BigQueryHelper.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/BigQueryHelper.java deleted file mode 100644 index 1ad008e4b..000000000 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/BigQueryHelper.java +++ /dev/null @@ -1,106 +0,0 @@ -package com.wepay.kafka.connect.bigquery; - -/* - * Copyright 2016 WePay, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -import com.google.auth.oauth2.GoogleCredentials; -import com.google.cloud.bigquery.BigQuery; -import com.google.cloud.bigquery.BigQueryOptions; - -import com.wepay.kafka.connect.bigquery.exception.BigQueryConnectException; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.ByteArrayInputStream; -import java.nio.charset.StandardCharsets; - -/** - * Convenience class for creating a default {@link com.google.cloud.bigquery.BigQuery} instance, - * with or without login credentials. - */ -public class BigQueryHelper { - private static final Logger logger = LoggerFactory.getLogger(BigQueryHelper.class); - private static String keySource; - - /** - * Returns a default {@link BigQuery} instance for the specified project with credentials provided - * in the specified file, which can then be used for creating, updating, and inserting into tables - * from specific datasets. - * - * @param projectName The name of the BigQuery project to work with - * @param key The google credentials JSON key that can be used to provide - * credentials to BigQuery, or null if no authentication should be performed. - * @return The resulting BigQuery object. - */ - public BigQuery connect(String projectName, String key) { - if (key == null) { - return connect(projectName); - } - logger.debug("Attempting to open file {} for service account json key", key); - InputStream credentialsStream; - try { - if (keySource != null && keySource.equals("JSON")) { - credentialsStream = new ByteArrayInputStream(key.getBytes(StandardCharsets.UTF_8)); - } else { - credentialsStream = new FileInputStream(key); - } - return new - BigQueryOptions.DefaultBigQueryFactory().create( - BigQueryOptions.newBuilder() - .setProjectId(projectName) - .setCredentials(GoogleCredentials.fromStream(credentialsStream)) - .build() - ); - } catch (IOException err) { - throw new BigQueryConnectException("Failed to access json key file", err); - } - } - /** - * Returns a default {@link BigQuery} instance for the specified project with credentials provided - * in the specified file, which can then be used for creating, updating, and inserting into tables - * from specific datasets. - * - * @param keySource The type of key config we can expect. This is either a String - * representation of the Google credentials file, or the path to the Google credentials file. - * @return The resulting BigQuery object. 
- */ - public BigQueryHelper setKeySource(String keySource) { - this.keySource = keySource; - return this; - } - - /** - * Returns a default {@link BigQuery} instance for the specified project with no authentication - * credentials, which can then be used for creating, updating, and inserting into tables from - * specific datasets. - * - * @param projectName The name of the BigQuery project to work with - * @return The resulting BigQuery object. - */ - public BigQuery connect(String projectName) { - logger.debug("Attempting to access BigQuery without authentication"); - return new BigQueryOptions.DefaultBigQueryFactory().create( - BigQueryOptions.newBuilder() - .setProjectId(projectName) - .build() - ); - } -} diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/BigQuerySinkConnector.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/BigQuerySinkConnector.java index 2981adb8a..ae3ef6f43 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/BigQuerySinkConnector.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/BigQuerySinkConnector.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,19 +17,16 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery; -import com.google.cloud.bigquery.BigQuery; import com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig; -import com.wepay.kafka.connect.bigquery.exception.SinkConfigConnectException; - +import com.wepay.kafka.connect.bigquery.config.BigQuerySinkTaskConfig; import com.wepay.kafka.connect.bigquery.utils.Version; - +import org.apache.kafka.common.config.Config; import org.apache.kafka.common.config.ConfigDef; -import org.apache.kafka.common.config.ConfigException; - +import org.apache.kafka.common.config.ConfigValue; import org.apache.kafka.connect.connector.Task; import org.apache.kafka.connect.sink.SinkConnector; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -43,27 +40,6 @@ * {@link org.apache.kafka.connect.sink.SinkTask SinkTasks}. 
*/ public class BigQuerySinkConnector extends SinkConnector { - private final BigQuery testBigQuery; - private final SchemaManager testSchemaManager; - - public static final String GCS_BQ_TASK_CONFIG_KEY = "GCSBQTask"; - - public BigQuerySinkConnector() { - testBigQuery = null; - testSchemaManager = null; - } - - // For testing purposes only; will never be called by the Kafka Connect framework - BigQuerySinkConnector(BigQuery bigQuery) { - this.testBigQuery = bigQuery; - this.testSchemaManager = null; - } - - // For testing purposes only; will never be called by the Kafka Connect framework - BigQuerySinkConnector(BigQuery bigQuery, SchemaManager schemaManager) { - this.testBigQuery = bigQuery; - this.testSchemaManager = schemaManager; - } private BigQuerySinkConfig config; private Map configProperties; @@ -73,21 +49,26 @@ public BigQuerySinkConnector() { @Override public ConfigDef config() { logger.trace("connector.config()"); - return config.getConfig(); + return BigQuerySinkConfig.getConfig(); + } + + @Override + public Config validate(Map properties) { + List singlePropertyValidations = config().validate(properties); + // If any of our properties had malformed syntax or failed a validation to ensure, e.g., that it fell within an + // acceptable numeric range, we only report those errors since they prevent us from being able to construct a + // valid BigQuerySinkConfig instance + if (singlePropertyValidations.stream().anyMatch(v -> !v.errorMessages().isEmpty())) { + return new Config(singlePropertyValidations); + } + return new BigQuerySinkConfig(properties).validate(); } @Override public void start(Map properties) { logger.trace("connector.start()"); - try { - configProperties = properties; - config = new BigQuerySinkConfig(properties); - } catch (ConfigException err) { - throw new SinkConfigConnectException( - "Couldn't start BigQuerySinkConnector due to configuration error", - err - ); - } + configProperties = properties; + config = new BigQuerySinkConfig(properties); } @Override @@ -106,12 +87,12 @@ public List> taskConfigs(int maxTasks) { logger.trace("connector.taskConfigs()"); List> taskConfigs = new ArrayList<>(); for (int i = 0; i < maxTasks; i++) { - // Copy configProperties so that tasks can't interfere with each others' configurations HashMap taskConfig = new HashMap<>(configProperties); if (i == 0 && !config.getList(BigQuerySinkConfig.ENABLE_BATCH_CONFIG).isEmpty()) { // if batch loading is enabled, configure first task to do the GCS -> BQ loading - taskConfig.put(GCS_BQ_TASK_CONFIG_KEY, "true"); + taskConfig.put(BigQuerySinkTaskConfig.GCS_BQ_TASK_CONFIG, "true"); } + taskConfig.put(BigQuerySinkTaskConfig.TASK_ID_CONFIG, Integer.toString(i)); taskConfigs.add(taskConfig); } return taskConfigs; diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/BigQuerySinkTask.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/BigQuerySinkTask.java index b338e27b0..39f9b59f5 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/BigQuerySinkTask.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/BigQuerySinkTask.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,10 +17,16 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery; import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.bigquery.BigQueryException; import com.google.cloud.bigquery.InsertAllRequest.RowToInsert; +import com.google.cloud.bigquery.StandardTableDefinition; +import com.google.cloud.bigquery.Table; import com.google.cloud.bigquery.TableId; +import com.google.cloud.bigquery.TimePartitioning; +import com.google.cloud.bigquery.TimePartitioning.Type; import com.google.cloud.storage.Bucket; import com.google.cloud.storage.BucketInfo; import com.google.cloud.storage.Storage; @@ -28,44 +34,46 @@ import com.wepay.kafka.connect.bigquery.api.SchemaRetriever; import com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig; import com.wepay.kafka.connect.bigquery.config.BigQuerySinkTaskConfig; -import com.wepay.kafka.connect.bigquery.convert.RecordConverter; import com.wepay.kafka.connect.bigquery.convert.SchemaConverter; -import com.wepay.kafka.connect.bigquery.utils.SinkRecordConverter; -import com.wepay.kafka.connect.bigquery.exception.BigQueryConnectException; -import com.wepay.kafka.connect.bigquery.exception.SinkConfigConnectException; import com.wepay.kafka.connect.bigquery.utils.FieldNameSanitizer; import com.wepay.kafka.connect.bigquery.utils.PartitionedTableId; +import com.wepay.kafka.connect.bigquery.utils.SinkRecordConverter; +import com.wepay.kafka.connect.bigquery.utils.TableNameUtils; import com.wepay.kafka.connect.bigquery.utils.Version; import com.wepay.kafka.connect.bigquery.write.batch.GCSBatchTableWriter; import com.wepay.kafka.connect.bigquery.write.batch.KCBQThreadPoolExecutor; +import com.wepay.kafka.connect.bigquery.write.batch.MergeBatches; import com.wepay.kafka.connect.bigquery.write.batch.TableWriter; import com.wepay.kafka.connect.bigquery.write.batch.TableWriterBuilder; import com.wepay.kafka.connect.bigquery.write.row.AdaptiveBigQueryWriter; +import com.wepay.kafka.connect.bigquery.write.row.BigQueryErrorResponses; import com.wepay.kafka.connect.bigquery.write.row.BigQueryWriter; import com.wepay.kafka.connect.bigquery.write.row.GCSToBQWriter; import com.wepay.kafka.connect.bigquery.write.row.SimpleBigQueryWriter; +import com.wepay.kafka.connect.bigquery.write.row.UpsertDeleteBigQueryWriter; +import java.io.IOException; import org.apache.kafka.clients.consumer.OffsetAndMetadata; import org.apache.kafka.common.TopicPartition; import org.apache.kafka.common.config.ConfigException; import org.apache.kafka.common.record.TimestampType; import org.apache.kafka.connect.errors.ConnectException; +import org.apache.kafka.connect.errors.RetriableException; import org.apache.kafka.connect.sink.SinkRecord; import org.apache.kafka.connect.sink.SinkTask; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.time.Instant; -import java.util.Collection; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.UUID; +import java.util.*; +import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; -import java.util.Optional; +import java.util.concurrent.atomic.AtomicReference; + +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.TOPIC2TABLE_MAP_CONFIG; +import static com.wepay.kafka.connect.bigquery.utils.TableNameUtils.intTable; /** * A {@link SinkTask} used to translate Kafka Connect 
{@link SinkRecord SinkRecords} into BigQuery @@ -74,6 +82,8 @@ public class BigQuerySinkTask extends SinkTask { private static final Logger logger = LoggerFactory.getLogger(BigQuerySinkTask.class); + private AtomicReference bigQuery; + private AtomicReference schemaManager; private SchemaRetriever schemaRetriever; private BigQueryWriter bigQueryWriter; private GCSToBQWriter gcsToBQWriter; @@ -83,6 +93,10 @@ public class BigQuerySinkTask extends SinkTask { private boolean useMessageTimeDatePartitioning; private boolean usePartitionDecorator; private boolean sanitize; + private boolean upsertDelete; + private MergeBatches mergeBatches; + private MergeQueries mergeQueries; + private volatile boolean stopped; private TopicPartitionManager topicPartitionManager; @@ -94,7 +108,10 @@ public class BigQuerySinkTask extends SinkTask { private final SchemaManager testSchemaManager; private final UUID uuid = UUID.randomUUID(); - private ScheduledExecutorService gcsLoadExecutor; + private ScheduledExecutorService loadExecutor; + + private Map cache; + private Map topic2TableMap; /** * Create a new BigquerySinkTask. @@ -115,15 +132,30 @@ public BigQuerySinkTask() { * @param testSchemaManager {@link SchemaManager} to use for testing (likely a mock) * @see BigQuerySinkTask#BigQuerySinkTask() */ - public BigQuerySinkTask(BigQuery testBigQuery, SchemaRetriever schemaRetriever, Storage testGcs, SchemaManager testSchemaManager) { + public BigQuerySinkTask(BigQuery testBigQuery, SchemaRetriever schemaRetriever, Storage testGcs, + SchemaManager testSchemaManager, Map testCache) { this.testBigQuery = testBigQuery; this.schemaRetriever = schemaRetriever; this.testGcs = testGcs; this.testSchemaManager = testSchemaManager; + this.cache = testCache; } @Override public void flush(Map offsets) { + if (upsertDelete) { + throw new ConnectException("This connector cannot perform upsert/delete on older versions of " + + "the Connect framework; please upgrade to version 0.10.2.0 or later"); + } + + // Return immediately here since the executor will already be shutdown + if (stopped) { + // Still have to check for errors in order to prevent offsets being committed for records that + // we've failed to write + executor.maybeThrowEncounteredError(); + return; + } + try { executor.awaitCurrentTasks(); } catch (InterruptedException err) { @@ -133,47 +165,79 @@ public void flush(Map offsets) { topicPartitionManager.resumeAll(); } - private void maybeEnsureExistingTable(TableId table) { - BigQuery bigQuery = getBigQuery(); - if (bigQuery.getTable(table) == null && !config.getBoolean(config.TABLE_CREATE_CONFIG)) { - throw new BigQueryConnectException("Table '" + table + "' does not exist. 
" + - "You may want to enable auto table creation by setting " + config.TABLE_CREATE_CONFIG - + "=true in the properties file"); + @Override + public Map preCommit(Map offsets) { + if (upsertDelete) { + Map result = mergeBatches.latestOffsets(); + checkQueueSize(); + return result; } + + flush(offsets); + return offsets; } private PartitionedTableId getRecordTable(SinkRecord record) { String tableName; - String dataset = config.getString(config.DEFAULT_DATASET_CONFIG); - String[] smtReplacement = record.topic().split(":"); - - if (smtReplacement.length == 2) { - dataset = smtReplacement[0]; - tableName = smtReplacement[1]; - } else if (smtReplacement.length == 1) { - tableName = smtReplacement[0]; + String dataset = config.getString(BigQuerySinkConfig.DEFAULT_DATASET_CONFIG); + if (topic2TableMap != null) { + tableName = topic2TableMap.getOrDefault(record.topic(), record.topic()); } else { - throw new ConfigException("Incorrect regex replacement format. " + - "SMT replacement should either produce the : format or just the format."); - } + String[] smtReplacement = record.topic().split(":"); - if (sanitize) { - tableName = FieldNameSanitizer.sanitizeName(tableName); + if (smtReplacement.length == 2) { + dataset = smtReplacement[0]; + tableName = smtReplacement[1]; + } else if (smtReplacement.length == 1) { + tableName = smtReplacement[0]; + } else { + throw new ConnectException(String.format( + "Incorrect regex replacement format in topic name '%s'. " + + "SMT replacement should either produce the : format " + + "or just the format.", + record.topic() + )); + } + if (sanitize) { + tableName = FieldNameSanitizer.sanitizeName(tableName); + } } + + // TODO: Order of execution of topic/table name modifications => + // regex router SMT modifies topic name in sinkrecord. + // It could be either : separated or not. + + // should we use topic2table map with sanitize table name? doesn't make sense. + + // we use table name from above to sanitize table name further. 
+ + TableId baseTableId = TableId.of(dataset, tableName); - maybeEnsureExistingTable(baseTableId); + if (upsertDelete) { + TableId intermediateTableId = mergeBatches.intermediateTableFor(baseTableId); + // If upsert/delete is enabled, we want to stream into a non-partitioned intermediate table + return new PartitionedTableId.Builder(intermediateTableId).build(); + } PartitionedTableId.Builder builder = new PartitionedTableId.Builder(baseTableId); if (usePartitionDecorator) { + Table bigQueryTable = retrieveCachedTable(baseTableId); + TimePartitioning timePartitioning = TimePartitioning.of(Type.DAY); + if (bigQueryTable != null) { + StandardTableDefinition standardTableDefinition = bigQueryTable.getDefinition(); + if (standardTableDefinition != null && standardTableDefinition.getTimePartitioning() != null) { + timePartitioning = standardTableDefinition.getTimePartitioning(); + } + } if (useMessageTimeDatePartitioning) { if (record.timestampType() == TimestampType.NO_TIMESTAMP_TYPE) { throw new ConnectException( "Message has no timestamp type, cannot use message timestamp to partition."); } - builder.setDayPartition(record.timestamp()); + setTimePartitioningForTimestamp(baseTableId, builder, timePartitioning, record.timestamp()); } else { - builder.setDayPartitionForNow(); + setTimePartitioning(baseTableId, builder, timePartitioning); } } @@ -182,36 +246,45 @@ private PartitionedTableId getRecordTable(SinkRecord record) { @Override public void put(Collection records) { - logger.info("Putting {} records in the sink.", records.size()); + // Periodically poll for errors here instead of doing a stop-the-world check in flush() + executor.maybeThrowEncounteredError(); + + logger.debug("Putting {} records in the sink.", records.size()); // create tableWriters Map tableWriterBuilders = new HashMap<>(); for (SinkRecord record : records) { - if (record.value() != null) { + if (record.value() != null || config.getBoolean(BigQuerySinkConfig.DELETE_ENABLED_CONFIG)) { PartitionedTableId table = getRecordTable(record); if (!tableWriterBuilders.containsKey(table)) { TableWriterBuilder tableWriterBuilder; - if (config.getList(config.ENABLE_BATCH_CONFIG).contains(record.topic())) { + if (config.getList(BigQuerySinkConfig.ENABLE_BATCH_CONFIG).contains(record.topic())) { String topic = record.topic(); - String gcsBlobName = topic + "_" + uuid + "_" + Instant.now().toEpochMilli(); - String gcsFolderName = config.getString(config.GCS_FOLDER_NAME_CONFIG); + long offset = record.kafkaOffset(); + String gcsBlobName = topic + "_" + uuid + "_" + Instant.now().toEpochMilli() + "_" + offset; + String gcsFolderName = config.getString(BigQuerySinkConfig.GCS_FOLDER_NAME_CONFIG); if (gcsFolderName != null && !"".equals(gcsFolderName)) { gcsBlobName = gcsFolderName + "/" + gcsBlobName; } tableWriterBuilder = new GCSBatchTableWriter.Builder( gcsToBQWriter, table.getBaseTableId(), - config.getString(config.GCS_BUCKET_NAME_CONFIG), + config.getString(BigQuerySinkConfig.GCS_BUCKET_NAME_CONFIG), gcsBlobName, recordConverter); } else { - tableWriterBuilder = + TableWriter.Builder simpleTableWriterBuilder = new TableWriter.Builder(bigQueryWriter, table, recordConverter); + if (upsertDelete) { + simpleTableWriterBuilder.onFinish(rows -> + mergeBatches.onRowWrites(table.getBaseTableId(), rows)); + } + tableWriterBuilder = simpleTableWriterBuilder; } tableWriterBuilders.put(table, tableWriterBuilder); } - tableWriterBuilders.get(table).addRow(record); + tableWriterBuilders.get(table).addRow(record, table.getBaseTableId()); } } @@ 
-221,7 +294,19 @@ public void put(Collection records) { } // check if we should pause topics - long queueSoftLimit = config.getLong(BigQuerySinkTaskConfig.QUEUE_SIZE_CONFIG); + checkQueueSize(); + } + + // Important: this method is only safe to call during put(), flush(), or preCommit(); otherwise, + // a ConcurrentModificationException may be triggered if the Connect framework is in the middle of + // a method invocation on the consumer for this task. This becomes especially likely if all topics + // have been paused as the framework will most likely be in the middle of a poll for that consumer + // which, because all of its topics have been paused, will not return until it's time for the next + // offset commit. Invoking context.requestCommit() won't wake up the consumer in that case, so we + // really have no choice but to wait for the framework to call a method on this task that implies + // that it's safe to pause or resume partitions on the consumer. + private void checkQueueSize() { + long queueSoftLimit = config.getLong(BigQuerySinkConfig.QUEUE_SIZE_CONFIG); if (queueSoftLimit != -1) { int currentQueueSize = executor.getQueue().size(); if (currentQueueSize > queueSoftLimit) { @@ -237,39 +322,100 @@ private BigQuery getBigQuery() { if (testBigQuery != null) { return testBigQuery; } - String projectName = config.getString(config.PROJECT_CONFIG); - String keyFile = config.getKeyFile(); - String keySource = config.getString(config.KEY_SOURCE_CONFIG); - return new BigQueryHelper().setKeySource(keySource).connect(projectName, keyFile); + return bigQuery.updateAndGet(bq -> bq != null ? bq : newBigQuery()); + } + + private void setTimePartitioningForTimestamp( + TableId table, PartitionedTableId.Builder builder, TimePartitioning timePartitioning, Long timestamp + ) { + if (timePartitioning.getType() != Type.DAY) { + throw new ConnectException(String.format( + "Cannot use decorator syntax to write to %s as it is partitioned by %s and not by day", + TableNameUtils.table(table), + timePartitioning.getType().toString().toLowerCase() + )); + } + builder.setDayPartition(timestamp); + } + + private void setTimePartitioning(TableId table, PartitionedTableId.Builder builder, TimePartitioning timePartitioning) { + if (timePartitioning.getType() != Type.DAY) { + throw new ConnectException(String.format( + "Cannot use decorator syntax to write to %s as it is partitioned by %s and not by day", + TableNameUtils.table(table), + timePartitioning.getType().toString().toLowerCase() + )); + } + builder.setDayPartitionForNow(); + } + + private Table retrieveCachedTable(TableId tableId) { + return getCache().computeIfAbsent(tableId, this::retrieveTable); + } + + private Table retrieveTable(TableId tableId) { + try { + return getBigQuery().getTable(tableId); + } catch (BigQueryException e) { + if (BigQueryErrorResponses.isIOError(e)) { + throw new RetriableException("Failed to retrieve information for table " + tableId, e); + } else { + throw e; + } + } } - private SchemaManager getSchemaManager(BigQuery bigQuery) { + private BigQuery newBigQuery() { + return new GcpClientBuilder.BigQueryBuilder() + .withConfig(config) + .build(); + } + + private SchemaManager getSchemaManager() { if (testSchemaManager != null) { return testSchemaManager; } + return schemaManager.updateAndGet(sm -> sm != null ? 
sm : newSchemaManager()); + } + + private SchemaManager newSchemaManager() { schemaRetriever = config.getSchemaRetriever(); SchemaConverter schemaConverter = config.getSchemaConverter(); Optional kafkaKeyFieldName = config.getKafkaKeyFieldName(); Optional kafkaDataFieldName = config.getKafkaDataFieldName(); Optional timestampPartitionFieldName = config.getTimestampPartitionFieldName(); - Optional> clusteringFieldName = config.getClusteringPartitionFieldName(); - boolean allowNewBQFields = config.getBoolean(config.ALLOW_NEW_BIGQUERY_FIELDS_CONFIG); - boolean allowReqFieldRelaxation = config.getBoolean(config.ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_CONFIG); - return new SchemaManager(schemaRetriever, schemaConverter, bigQuery, allowNewBQFields, allowReqFieldRelaxation, kafkaKeyFieldName, - kafkaDataFieldName, timestampPartitionFieldName, clusteringFieldName); + Optional partitionExpiration = config.getPartitionExpirationMs(); + Optional> clusteringFieldName = config.getClusteringPartitionFieldNames(); + Optional timePartitioningType = config.getTimePartitioningType(); + boolean allowNewBQFields = config.getBoolean(BigQuerySinkConfig.ALLOW_NEW_BIGQUERY_FIELDS_CONFIG); + boolean allowReqFieldRelaxation = config.getBoolean(BigQuerySinkConfig.ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_CONFIG); + boolean allowSchemaUnionization = config.getBoolean(BigQuerySinkConfig.ALLOW_SCHEMA_UNIONIZATION_CONFIG); + boolean sanitizeFieldNames = config.getBoolean(BigQuerySinkConfig.SANITIZE_FIELD_NAME_CONFIG); + return new SchemaManager(schemaRetriever, schemaConverter, getBigQuery(), + allowNewBQFields, allowReqFieldRelaxation, allowSchemaUnionization, + sanitizeFieldNames, + kafkaKeyFieldName, kafkaDataFieldName, + timestampPartitionFieldName, partitionExpiration, clusteringFieldName, timePartitioningType); } private BigQueryWriter getBigQueryWriter() { - boolean autoCreateTables = config.getBoolean(config.TABLE_CREATE_CONFIG); - boolean allowNewBigQueryFields = config.getBoolean(config.ALLOW_NEW_BIGQUERY_FIELDS_CONFIG); - boolean allowRequiredFieldRelaxation = config.getBoolean(config.ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_CONFIG); - int retry = config.getInt(config.BIGQUERY_RETRY_CONFIG); - long retryWait = config.getLong(config.BIGQUERY_RETRY_WAIT_CONFIG); + boolean autoCreateTables = config.getBoolean(BigQuerySinkConfig.TABLE_CREATE_CONFIG); + boolean allowNewBigQueryFields = config.getBoolean(BigQuerySinkConfig.ALLOW_NEW_BIGQUERY_FIELDS_CONFIG); + boolean allowRequiredFieldRelaxation = config.getBoolean(BigQuerySinkConfig.ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_CONFIG); + int retry = config.getInt(BigQuerySinkConfig.BIGQUERY_RETRY_CONFIG); + long retryWait = config.getLong(BigQuerySinkConfig.BIGQUERY_RETRY_WAIT_CONFIG); BigQuery bigQuery = getBigQuery(); - if (autoCreateTables || allowNewBigQueryFields || allowRequiredFieldRelaxation ) { + if (upsertDelete) { + return new UpsertDeleteBigQueryWriter(bigQuery, + getSchemaManager(), + retry, + retryWait, + autoCreateTables, + mergeBatches.intermediateToDestinationTables()); + } else if (autoCreateTables || allowNewBigQueryFields || allowRequiredFieldRelaxation) { return new AdaptiveBigQueryWriter(bigQuery, - getSchemaManager(bigQuery), + getSchemaManager(), retry, retryWait, autoCreateTables); @@ -282,21 +428,19 @@ private Storage getGcs() { if (testGcs != null) { return testGcs; } - String projectName = config.getString(config.PROJECT_CONFIG); - String key = config.getKeyFile(); - String keySource = config.getString(config.KEY_SOURCE_CONFIG); - return 
new GCSBuilder(projectName).setKey(key).setKeySource(keySource).build(); - + return new GcpClientBuilder.GcsBuilder() + .withConfig(config) + .build(); } private GCSToBQWriter getGcsWriter() { BigQuery bigQuery = getBigQuery(); - int retry = config.getInt(config.BIGQUERY_RETRY_CONFIG); - long retryWait = config.getLong(config.BIGQUERY_RETRY_WAIT_CONFIG); - boolean autoCreateTables = config.getBoolean(config.TABLE_CREATE_CONFIG); + int retry = config.getInt(BigQuerySinkConfig.BIGQUERY_RETRY_CONFIG); + long retryWait = config.getLong(BigQuerySinkConfig.BIGQUERY_RETRY_WAIT_CONFIG); + boolean autoCreateTables = config.getBoolean(BigQuerySinkConfig.TABLE_CREATE_CONFIG); // schemaManager shall only be needed for creating table hence do not fetch instance if not // needed. - SchemaManager schemaManager = autoCreateTables ? getSchemaManager(bigQuery) : null; + SchemaManager schemaManager = autoCreateTables ? getSchemaManager() : null; return new GCSToBQWriter(getGcs(), bigQuery, schemaManager, @@ -305,86 +449,138 @@ private GCSToBQWriter getGcsWriter() { autoCreateTables); } - private SinkRecordConverter getConverter(BigQuerySinkConfig config) { - return new SinkRecordConverter(config.getRecordConverter(), - config.getBoolean(config.SANITIZE_FIELD_NAME_CONFIG), - config.getKafkaKeyFieldName(), - config.getKafkaDataFieldName()); + private SinkRecordConverter getConverter(BigQuerySinkTaskConfig config) { + return new SinkRecordConverter(config, mergeBatches, mergeQueries); + } + + private synchronized Map getCache() { + if (cache == null) { + cache = new HashMap<>(); + } + + return cache; } @Override public void start(Map properties) { logger.trace("task.start()"); - final boolean hasGCSBQTask = - properties.remove(BigQuerySinkConnector.GCS_BQ_TASK_CONFIG_KEY) != null; - try { - config = new BigQuerySinkTaskConfig(properties); - } catch (ConfigException err) { - throw new SinkConfigConnectException( - "Couldn't start BigQuerySinkTask due to configuration error", - err + stopped = false; + config = new BigQuerySinkTaskConfig(properties); + + upsertDelete = config.getBoolean(BigQuerySinkConfig.UPSERT_ENABLED_CONFIG) + || config.getBoolean(BigQuerySinkConfig.DELETE_ENABLED_CONFIG); + + bigQuery = new AtomicReference<>(); + schemaManager = new AtomicReference<>(); + + if (upsertDelete) { + String intermediateTableSuffix = String.format("_%s_%d_%s_%d", + config.getString(BigQuerySinkConfig.INTERMEDIATE_TABLE_SUFFIX_CONFIG), + config.getInt(BigQuerySinkTaskConfig.TASK_ID_CONFIG), + uuid, + Instant.now().toEpochMilli() ); + mergeBatches = new MergeBatches(intermediateTableSuffix); } + cache = getCache(); bigQueryWriter = getBigQueryWriter(); gcsToBQWriter = getGcsWriter(); - recordConverter = getConverter(config); executor = new KCBQThreadPoolExecutor(config, new LinkedBlockingQueue<>()); topicPartitionManager = new TopicPartitionManager(); useMessageTimeDatePartitioning = - config.getBoolean(config.BIGQUERY_MESSAGE_TIME_PARTITIONING_CONFIG); + config.getBoolean(BigQuerySinkConfig.BIGQUERY_MESSAGE_TIME_PARTITIONING_CONFIG); usePartitionDecorator = - config.getBoolean(config.BIGQUERY_PARTITION_DECORATOR_CONFIG); + config.getBoolean(BigQuerySinkConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG); sanitize = config.getBoolean(BigQuerySinkConfig.SANITIZE_TOPICS_CONFIG); - if (hasGCSBQTask) { + if (config.getBoolean(BigQuerySinkTaskConfig.GCS_BQ_TASK_CONFIG)) { startGCSToBQLoadTask(); + } else if (upsertDelete) { + mergeQueries = + new MergeQueries(config, mergeBatches, executor, getBigQuery(), getSchemaManager(), 
context); + maybeStartMergeFlushTask(); } + + recordConverter = getConverter(config); + topic2TableMap = config.getTopic2TableMap().orElse(null); } private void startGCSToBQLoadTask() { logger.info("Attempting to start GCS Load Executor."); - gcsLoadExecutor = Executors.newScheduledThreadPool(1); - String bucketName = config.getString(config.GCS_BUCKET_NAME_CONFIG); + loadExecutor = Executors.newScheduledThreadPool(1); + String bucketName = config.getString(BigQuerySinkConfig.GCS_BUCKET_NAME_CONFIG); Storage gcs = getGcs(); // get the bucket, or create it if it does not exist. Bucket bucket = gcs.get(bucketName); if (bucket == null) { // todo here is where we /could/ set a retention policy for the bucket, // but for now I don't think we want to do that. - if (config.getBoolean(config.AUTO_CREATE_BUCKET_CONFIG)) { + if (config.getBoolean(BigQuerySinkConfig.AUTO_CREATE_BUCKET_CONFIG)) { BucketInfo bucketInfo = BucketInfo.of(bucketName); bucket = gcs.create(bucketInfo); } else { - throw new ConnectException("Bucket '" + bucketName + "' does not exist; Create the bucket manually, or set '" + config.AUTO_CREATE_BUCKET_CONFIG + "' to true"); + throw new ConnectException(String.format( + "Bucket '%s' does not exist; Create the bucket manually, or set '%s' to true", + bucketName, + BigQuerySinkConfig.AUTO_CREATE_BUCKET_CONFIG + )); } } GCSToBQLoadRunnable loadRunnable = new GCSToBQLoadRunnable(getBigQuery(), bucket); int intervalSec = config.getInt(BigQuerySinkConfig.BATCH_LOAD_INTERVAL_SEC_CONFIG); - gcsLoadExecutor.scheduleAtFixedRate(loadRunnable, intervalSec, intervalSec, TimeUnit.SECONDS); + loadExecutor.scheduleAtFixedRate(loadRunnable, intervalSec, intervalSec, TimeUnit.SECONDS); + } + + private void maybeStartMergeFlushTask() { + long intervalMs = config.getLong(BigQuerySinkConfig.MERGE_INTERVAL_MS_CONFIG); + if (intervalMs == -1) { + logger.info("{} is set to -1; periodic merge flushes are disabled", BigQuerySinkConfig.MERGE_INTERVAL_MS_CONFIG); + return; + } + logger.info("Attempting to start upsert/delete load executor"); + loadExecutor = Executors.newScheduledThreadPool(1); + loadExecutor.scheduleAtFixedRate( + mergeQueries::mergeFlushAll, intervalMs, intervalMs, TimeUnit.MILLISECONDS); } @Override public void stop() { try { - executor.shutdown(); - executor.awaitTermination(EXECUTOR_SHUTDOWN_TIMEOUT_SEC, TimeUnit.SECONDS); - if (gcsLoadExecutor != null) { - try { - logger.info("Attempting to shut down GCS Load Executor."); - gcsLoadExecutor.shutdown(); - gcsLoadExecutor.awaitTermination(EXECUTOR_SHUTDOWN_TIMEOUT_SEC, TimeUnit.SECONDS); - } catch (InterruptedException ex) { - logger.warn("Could not shut down GCS Load Executor within {}s.", - EXECUTOR_SHUTDOWN_TIMEOUT_SEC); - } + maybeStopExecutor(loadExecutor, "load executor"); + maybeStopExecutor(executor, "table write executor"); + if (upsertDelete) { + mergeBatches.intermediateTables().forEach(table -> { + logger.debug("Deleting {}", intTable(table)); + getBigQuery().delete(table); + }); } - } catch (InterruptedException ex) { - logger.warn("{} active threads are still executing tasks {}s after shutdown is signaled.", - executor.getActiveCount(), EXECUTOR_SHUTDOWN_TIMEOUT_SEC); } finally { - logger.trace("task.stop()"); + stopped = true; + } + + logger.trace("task.stop()"); + } + + private void maybeStopExecutor(ExecutorService executor, String executorName) { + if (executor == null) { + return; + } + + try { + if (upsertDelete) { + logger.trace("Forcibly shutting down {}", executorName); + executor.shutdownNow(); + } else { + 
logger.trace("Requesting shutdown for {}", executorName); + executor.shutdown(); + } + logger.trace("Awaiting termination of {}", executorName); + executor.awaitTermination(EXECUTOR_SHUTDOWN_TIMEOUT_SEC, TimeUnit.SECONDS); + logger.trace("Shut down {} successfully", executorName); + } catch (Exception e) { + logger.warn("Failed to shut down {}", executorName, e); } } diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/GCSBuilder.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/GCSBuilder.java deleted file mode 100644 index e2dd3ed37..000000000 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/GCSBuilder.java +++ /dev/null @@ -1,107 +0,0 @@ -package com.wepay.kafka.connect.bigquery; - -/* - * Copyright 2016 WePay, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -import com.google.auth.oauth2.GoogleCredentials; -import com.google.cloud.storage.Storage; -import com.google.cloud.storage.StorageOptions; - -import com.wepay.kafka.connect.bigquery.exception.GCSConnectException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.ByteArrayInputStream; -import java.nio.charset.StandardCharsets; - -/** - * Convenience class for creating a {@link com.google.cloud.storage.Storage} instance - */ -public class GCSBuilder { - private static final Logger logger = LoggerFactory.getLogger(GCSBuilder.class); - - private final String projectName; - private String key; - private String keySource; - - public GCSBuilder(String projectName) { - this.projectName = projectName; - this.key = null; - } - - public GCSBuilder setKeySource(String keySourceType) { - this.keySource = keySourceType; - return this; - } - - public GCSBuilder setKey(String keyFile) { - this.key = keyFile; - return this; - } - public Storage build() { - return connect(projectName, key); - } - - /** - * Returns a default {@link Storage} instance for the specified project with credentials provided - * in the specified file. - * - * @param projectName The name of the GCS project to work with - * @param key The name of a file containing a JSON key that can be used to provide - * credentials to GCS, or null if no authentication should be performed. - * @return The resulting Storage object. 
- */ - private Storage connect(String projectName, String key) { - if (key == null) { - return connect(projectName); - } - try { - InputStream credentialsStream; - if (keySource != null && keySource.equals("JSON")) { - credentialsStream = new ByteArrayInputStream(key.getBytes(StandardCharsets.UTF_8)); - } else { - credentialsStream = new FileInputStream(key); - } - return StorageOptions.newBuilder() - .setProjectId(projectName) - .setCredentials(GoogleCredentials.fromStream(credentialsStream)) - .build() - .getService(); - } catch (IOException err) { - throw new GCSConnectException("Failed to access json key file", err); - } - } - - /** - * Returns a default {@link Storage} instance for the specified project with no authentication - * credentials. - * - * @param projectName The name of the GCS project to work with - * @return The resulting Storage object. - */ - private Storage connect(String projectName) { - logger.debug("Attempting to access BigQuery without authentication"); - return StorageOptions.newBuilder() - .setProjectId(projectName) - .build() - .getService(); - } -} - diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/GCSToBQLoadRunnable.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/GCSToBQLoadRunnable.java index 348fccb29..aa42c8e6c 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/GCSToBQLoadRunnable.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/GCSToBQLoadRunnable.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery; + import com.google.api.gax.paging.Page; import com.google.cloud.bigquery.BigQuery; import com.google.cloud.bigquery.BigQueryException; diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/GcpClientBuilder.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/GcpClientBuilder.java new file mode 100644 index 000000000..a24a897a4 --- /dev/null +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/GcpClientBuilder.java @@ -0,0 +1,180 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package com.wepay.kafka.connect.bigquery; + +import com.google.api.gax.rpc.FixedHeaderProvider; +import com.google.api.gax.rpc.HeaderProvider; +import com.google.auth.oauth2.GoogleCredentials; +import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.bigquery.BigQueryOptions; +import com.google.cloud.storage.Storage; +import com.google.cloud.storage.StorageOptions; +import com.google.common.annotations.VisibleForTesting; +import com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig; +import com.wepay.kafka.connect.bigquery.exception.BigQueryConnectException; +import com.wepay.kafka.connect.bigquery.utils.Version; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.ByteArrayInputStream; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.Objects; + +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.CONNECTOR_RUNTIME_PROVIDER_CONFIG; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.CONNECTOR_RUNTIME_PROVIDER_DEFAULT; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.CONNECTOR_RUNTIME_PROVIDER_TYPES; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.PROJECT_CONFIG; + +public abstract class GcpClientBuilder { + + public enum KeySource { + FILE, JSON, APPLICATION_DEFAULT + } + + private static final Logger logger = LoggerFactory.getLogger(GcpClientBuilder.class); + private static final String USER_AGENT_HEADER_KEY = "user-agent"; + private static final String USER_AGENT_HEADER_FORMAT = "%s (GPN: Confluent;) Google BigQuery Sink/%s"; + private HeaderProvider headerProvider = null; + private String project = null; + private KeySource keySource = null; + private String key = null; + + public GcpClientBuilder withConfig(BigQuerySinkConfig config) { + return withProject(config.getString(PROJECT_CONFIG)) + .withKeySource(config.getKeySource()) + .withKey(config.getKey()) + .withUserAgent(config.getString(CONNECTOR_RUNTIME_PROVIDER_CONFIG)); + } + + public GcpClientBuilder withProject(String project) { + Objects.requireNonNull(project, "Project cannot be null"); + this.project = project; + return this; + } + + public GcpClientBuilder withKeySource(KeySource keySource) { + Objects.requireNonNull(keySource, "Key cannot be null"); + this.keySource = keySource; + return this; + } + + public GcpClientBuilder withKey(String key) { + this.key = key; + return this; + } + + public GcpClientBuilder withUserAgent(String userAgent) { + if (!CONNECTOR_RUNTIME_PROVIDER_TYPES.contains(userAgent)) { + logger.warn(String.format("Invalid Kafka runtime provider value received. Provider : %s. 
Defaulting to %s", + userAgent, CONNECTOR_RUNTIME_PROVIDER_DEFAULT)); + userAgent = CONNECTOR_RUNTIME_PROVIDER_DEFAULT; + } + this.headerProvider = FixedHeaderProvider.create(USER_AGENT_HEADER_KEY, + String.format(USER_AGENT_HEADER_FORMAT, userAgent, Version.version())); + return this; + } + + public Client build() { + return doBuild(project, credentials(), headerProvider); + } + + private GoogleCredentials credentials() { + if (key == null && keySource != KeySource.APPLICATION_DEFAULT) { + return null; + } + + Objects.requireNonNull(keySource, "Key source must be defined to build a GCP client"); + Objects.requireNonNull(project, "Project must be defined to build a GCP client"); + + InputStream credentialsStream; + switch (keySource) { + case JSON: + credentialsStream = new ByteArrayInputStream(key.getBytes(StandardCharsets.UTF_8)); + break; + case FILE: + try { + logger.debug("Attempting to open file {} for service account json key", key); + credentialsStream = new FileInputStream(key); + } catch (IOException e) { + throw new BigQueryConnectException("Failed to access JSON key file", e); + } + break; + case APPLICATION_DEFAULT: + try { + logger.debug("Attempting to use application default credentials"); + return GoogleCredentials.getApplicationDefault(); + } catch (IOException e) { + throw new BigQueryConnectException("Failed to create Application Default Credentials", e); + } + default: + throw new IllegalArgumentException("Unexpected value for KeySource enum: " + keySource); + } + + try { + return GoogleCredentials.fromStream(credentialsStream); + } catch (IOException e) { + throw new BigQueryConnectException("Failed to create credentials from input stream", e); + } + } + + protected abstract Client doBuild(String project, GoogleCredentials credentials, HeaderProvider userAgent); + + public static class BigQueryBuilder extends GcpClientBuilder { + @Override + protected BigQuery doBuild(String project, GoogleCredentials credentials, HeaderProvider headerProvider) { + BigQueryOptions.Builder builder = BigQueryOptions.newBuilder() + .setProjectId(project) + .setHeaderProvider(headerProvider); + + if (credentials != null) { + builder.setCredentials(credentials); + } else { + logger.debug("Attempting to access BigQuery without authentication"); + } + + return builder.build().getService(); + } + } + + public static class GcsBuilder extends GcpClientBuilder { + @Override + protected Storage doBuild(String project, GoogleCredentials credentials, HeaderProvider headerProvider) { + StorageOptions.Builder builder = StorageOptions.newBuilder() + .setProjectId(project) + .setHeaderProvider(headerProvider); + + if (credentials != null) { + builder.setCredentials(credentials); + } else { + logger.debug("Attempting to access GCS without authentication"); + } + + return builder.build().getService(); + } + } + + @VisibleForTesting + HeaderProvider getHeaderProvider() { + return this.headerProvider; + } +} diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/MergeQueries.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/MergeQueries.java new file mode 100644 index 000000000..d8a251841 --- /dev/null +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/MergeQueries.java @@ -0,0 +1,465 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package com.wepay.kafka.connect.bigquery; + +import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.bigquery.Field; +import com.google.cloud.bigquery.FieldList; +import com.google.cloud.bigquery.QueryJobConfiguration; +import com.google.cloud.bigquery.Schema; +import com.google.cloud.bigquery.TableId; +import com.google.common.annotations.VisibleForTesting; +import com.wepay.kafka.connect.bigquery.config.BigQuerySinkTaskConfig; +import com.wepay.kafka.connect.bigquery.exception.ExpectedInterruptException; +import com.wepay.kafka.connect.bigquery.write.batch.KCBQThreadPoolExecutor; +import com.wepay.kafka.connect.bigquery.write.batch.MergeBatches; +import org.apache.kafka.connect.errors.ConnectException; +import org.apache.kafka.connect.sink.SinkTaskContext; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static com.wepay.kafka.connect.bigquery.utils.TableNameUtils.destTable; +import static com.wepay.kafka.connect.bigquery.utils.TableNameUtils.intTable; + +public class MergeQueries { + public static final String INTERMEDIATE_TABLE_KEY_FIELD_NAME = "key"; + public static final String INTERMEDIATE_TABLE_VALUE_FIELD_NAME = "value"; + public static final String INTERMEDIATE_TABLE_ITERATION_FIELD_NAME = "i"; + public static final String INTERMEDIATE_TABLE_PARTITION_TIME_FIELD_NAME = "partitionTime"; + public static final String INTERMEDIATE_TABLE_BATCH_NUMBER_FIELD = "batchNumber"; + + private static final Logger logger = LoggerFactory.getLogger(MergeQueries.class); + + private final String keyFieldName; + private final boolean insertPartitionTime; + private final boolean upsertEnabled; + private final boolean deleteEnabled; + private final MergeBatches mergeBatches; + private final KCBQThreadPoolExecutor executor; + private final BigQuery bigQuery; + private final SchemaManager schemaManager; + private final SinkTaskContext context; + + public MergeQueries(BigQuerySinkTaskConfig config, + MergeBatches mergeBatches, + KCBQThreadPoolExecutor executor, + BigQuery bigQuery, + SchemaManager schemaManager, + SinkTaskContext context) { + this( + config.getKafkaKeyFieldName().orElseThrow(() -> + new ConnectException("Kafka key field must be configured when upsert/delete is enabled") + ), + config.getBoolean(config.BIGQUERY_PARTITION_DECORATOR_CONFIG), + config.getBoolean(config.UPSERT_ENABLED_CONFIG), + config.getBoolean(config.DELETE_ENABLED_CONFIG), + mergeBatches, + executor, + bigQuery, + schemaManager, + context + ); + } + + @VisibleForTesting + MergeQueries(String keyFieldName, + boolean insertPartitionTime, + boolean upsertEnabled, + boolean deleteEnabled, + MergeBatches mergeBatches, + KCBQThreadPoolExecutor executor, + BigQuery bigQuery, + SchemaManager schemaManager, + SinkTaskContext context) { + this.keyFieldName = keyFieldName; + this.insertPartitionTime = insertPartitionTime; + 
this.upsertEnabled = upsertEnabled; + this.deleteEnabled = deleteEnabled; + this.mergeBatches = mergeBatches; + this.executor = executor; + this.bigQuery = bigQuery; + this.schemaManager = schemaManager; + this.context = context; + } + + public void mergeFlushAll() { + logger.debug("Triggering merge flush for all tables"); + mergeBatches.intermediateTables().forEach(this::mergeFlush); + } + + public void mergeFlush(TableId intermediateTable) { + final TableId destinationTable = mergeBatches.destinationTableFor(intermediateTable); + final int batchNumber = mergeBatches.incrementBatch(intermediateTable); + logger.trace("Triggering merge flush from {} to {} for batch {}", + intTable(intermediateTable), destTable(destinationTable), batchNumber); + + executor.execute(() -> { + try { + mergeFlush(intermediateTable, destinationTable, batchNumber); + } catch (InterruptedException e) { + throw new ExpectedInterruptException(String.format( + "Interrupted while performing merge flush of batch %d from %s to %s", + batchNumber, intTable(intermediateTable), destTable(destinationTable))); + } + }); + } + + private void mergeFlush( + TableId intermediateTable, TableId destinationTable, int batchNumber + ) throws InterruptedException{ + // If there are rows to flush in this batch, flush them + if (mergeBatches.prepareToFlush(intermediateTable, batchNumber)) { + logger.debug("Running merge query on batch {} from {}", + batchNumber, intTable(intermediateTable)); + String mergeFlushQuery = mergeFlushQuery(intermediateTable, destinationTable, batchNumber); + logger.trace(mergeFlushQuery); + bigQuery.query(QueryJobConfiguration.of(mergeFlushQuery)); + logger.trace("Merge from {} to {} completed", + intTable(intermediateTable), destTable(destinationTable)); + + logger.debug("Recording flush success for batch {} from {}", + batchNumber, intTable(intermediateTable)); + mergeBatches.recordSuccessfulFlush(intermediateTable, batchNumber); + + // Commit those offsets ASAP + context.requestCommit(); + + logger.info("Completed merge flush of batch {} from {} to {}", + batchNumber, intTable(intermediateTable), destTable(destinationTable)); + } + + // After, regardless of whether we flushed or not, clean up old batches from the intermediate + // table. Some rows may be several batches old but still in the table if they were in the + // streaming buffer during the last purge. 
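The comment above explains why rows older than the current batch can still be sitting in the intermediate table: rows that were in BigQuery's streaming buffer during the previous purge could not yet be removed. One plausible shape for the clean-up statement, consistent with that comment, is sketched below; it is purely illustrative and may differ from the actual batchClearQuery in this PR:

```java
// Illustrative only: clear all batches at or below the one just flushed, skipping rows
// that are still in the streaming buffer (a common way to do so is to require a non-NULL
// _PARTITIONTIME, since buffered rows have not been assigned a partition yet).
public class BatchClearSketch {
  static String batchClearQuery(String dataset, String intermediateTable, int batchNumber) {
    return "DELETE FROM `" + dataset + "`.`" + intermediateTable + "` "
        + "WHERE batchNumber <= " + batchNumber + " "
        + "AND _PARTITIONTIME IS NOT NULL;";
  }

  public static void main(String[] args) {
    System.out.println(batchClearQuery("my_dataset", "my_topic_tmp", 3));
  }
}
```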
+ logger.trace("Clearing batches from {} on back from {}", batchNumber, intTable(intermediateTable)); + String batchClearQuery = batchClearQuery(intermediateTable, batchNumber); + logger.trace(batchClearQuery); + bigQuery.query(QueryJobConfiguration.of(batchClearQuery)); + } + + @VisibleForTesting + String mergeFlushQuery(TableId intermediateTable, TableId destinationTable, int batchNumber) { + Schema intermediateSchema = schemaManager.cachedSchema(intermediateTable); + + if (upsertEnabled && deleteEnabled) { + return upsertDeleteMergeFlushQuery(intermediateTable, destinationTable, batchNumber, intermediateSchema); + } else if (upsertEnabled) { + return upsertMergeFlushQuery(intermediateTable, destinationTable, batchNumber, intermediateSchema); + } else if (deleteEnabled) { + return deleteMergeFlushQuery(intermediateTable, destinationTable, batchNumber, intermediateSchema); + } else { + throw new IllegalStateException("At least one of upsert or delete must be enabled for merge flushing to occur."); + } + } + + /* + MERGE ``.`` + USING ( + SELECT * FROM ( + SELECT ARRAY_AGG( + x ORDER BY i DESC LIMIT 1 + )[OFFSET(0)] src + FROM ``.`` x + WHERE batchNumber= + GROUP BY key.[, key....] + ) + ) + ON ``.=src.key + WHEN MATCHED AND src.value IS NOT NULL + THEN UPDATE SET ``=src.value.[, ``=src.value....] + WHEN MATCHED AND src.value IS NULL + THEN DELETE + WHEN NOT MATCHED AND src.value IS NOT NULL + THEN INSERT (``, [_PARTITIONTIME, ]``[, ``]) + VALUES ( + src.key, + [CAST(CAST(DATE(src.partitionTime) AS DATE) AS TIMESTAMP),] + src.value.[, src.value....] + ); + */ + private String upsertDeleteMergeFlushQuery( + TableId intermediateTable, TableId destinationTable, int batchNumber, Schema intermediateSchema + ) { + List keyFields = listFields( + intermediateSchema.getFields().get(INTERMEDIATE_TABLE_KEY_FIELD_NAME).getSubFields(), + INTERMEDIATE_TABLE_KEY_FIELD_NAME + "." + ); + + List valueColumns = valueColumns(intermediateSchema); + + final String key = INTERMEDIATE_TABLE_KEY_FIELD_NAME; + final String i = INTERMEDIATE_TABLE_ITERATION_FIELD_NAME; + final String value = INTERMEDIATE_TABLE_VALUE_FIELD_NAME; + final String batch = INTERMEDIATE_TABLE_BATCH_NUMBER_FIELD; + + return "MERGE " + table(destinationTable) + " " + + "USING (" + + "SELECT * FROM (" + + "SELECT ARRAY_AGG(" + + "x ORDER BY " + i + " DESC LIMIT 1" + + ")[OFFSET(0)] src " + + "FROM " + table(intermediateTable) + " x " + + "WHERE " + batch + "=" + batchNumber + " " + + "GROUP BY " + String.join(", ", keyFields) + + ")" + + ") " + + "ON `" + destinationTable.getTable() + "`." + keyFieldName + "=src." + key + " " + + "WHEN MATCHED AND src." + value + " IS NOT NULL " + + "THEN UPDATE SET " + valueColumns.stream().map(col -> "`" + col + "`=src." + value + "." + col).collect(Collectors.joining(", ")) + " " + + "WHEN MATCHED AND src." + value + " IS NULL " + + "THEN DELETE " + + "WHEN NOT MATCHED AND src." + value + " IS NOT NULL " + + "THEN INSERT (`" + + keyFieldName + "`, " + + partitionTimePseudoColumn() + + "`" + + String.join("`, `", valueColumns) + "`) " + + "VALUES (" + + "src." + key + ", " + + partitionTimeValue() + + valueColumns.stream().map(col -> "src." + value + "." + col).collect(Collectors.joining(", ")) + + ");"; + } + + /* + MERGE ``.`` + USING ( + SELECT * FROM ( + SELECT ARRAY_AGG( + x ORDER BY i DESC LIMIT 1 + )[OFFSET(0)] src + FROM ``.`` x + WHERE batchNumber= + GROUP BY key.[, key....] + ) + ) + ON ``.=src.key + WHEN MATCHED + THEN UPDATE SET ``=src.value.[, ``=src.value....] 
+ WHEN NOT MATCHED + THEN INSERT (``, [_PARTITIONTIME, ]``[, ``]) + VALUES ( + src.key, + [CAST(CAST(DATE(src.partitionTime) AS DATE) AS TIMESTAMP),] + src.value.[, src.value....] + ); + */ + private String upsertMergeFlushQuery( + TableId intermediateTable, TableId destinationTable, int batchNumber, Schema intermediateSchema + ) { + List keyFields = listFields( + intermediateSchema.getFields().get(INTERMEDIATE_TABLE_KEY_FIELD_NAME).getSubFields(), + INTERMEDIATE_TABLE_KEY_FIELD_NAME + "." + ); + + List valueColumns = valueColumns(intermediateSchema); + + final String key = INTERMEDIATE_TABLE_KEY_FIELD_NAME; + final String i = INTERMEDIATE_TABLE_ITERATION_FIELD_NAME; + final String value = INTERMEDIATE_TABLE_VALUE_FIELD_NAME; + final String batch = INTERMEDIATE_TABLE_BATCH_NUMBER_FIELD; + + return "MERGE " + table(destinationTable) + " " + + "USING (" + + "SELECT * FROM (" + + "SELECT ARRAY_AGG(" + + "x ORDER BY " + i + " DESC LIMIT 1" + + ")[OFFSET(0)] src " + + "FROM " + table(intermediateTable) + " x " + + "WHERE " + batch + "=" + batchNumber + " " + + "GROUP BY " + String.join(", ", keyFields) + + ")" + + ") " + + "ON `" + destinationTable.getTable() + "`." + keyFieldName + "=src." + key + " " + + "WHEN MATCHED " + + "THEN UPDATE SET " + valueColumns.stream().map(col -> "`" + col + "`=src." + value + "." + col).collect(Collectors.joining(", ")) + " " + + "WHEN NOT MATCHED " + + "THEN INSERT (`" + + keyFieldName + "`, " + + partitionTimePseudoColumn() + + "`" + + String.join("`, `", valueColumns) + "`) " + + "VALUES (" + + "src." + key + ", " + + partitionTimeValue() + + valueColumns.stream().map(col -> "src." + value + "." + col).collect(Collectors.joining(", ")) + + ");"; + } + + /* + Delete-only is the trickiest mode. Naively, we could just run a MERGE using the intermediate + table as a source and sort in ascending order of iteration. However, this would miss an edge + case where, for a given key, a non-tombstone record is sent and then followed by a tombstone, + and would result in all rows with that key being deleted from the table, followed by an + insertion of a row for the initial non-tombstone record. This is incorrect; any and all + records with a given key that precede a tombstone should either never make it into BigQuery or + be deleted once the tombstone record is merge flushed. + So instead, we have to try to filter out rows from the source (i.e., intermediate) table + that precede tombstone records for their keys. We do this by: + - Finding the latest tombstone row for each key in the current batch and extracting the + iteration number for each, referring to this as the "deletes" table + - Joining that with the current batch from the intermediate table on the row key, keeping + both tables' iteration numbers (a RIGHT JOIN is used so that rows whose keys don't have + any tombstones present are included with a NULL iteration number for the "deletes" table) + - Filtering out all rows where the "delete" table's iteration number is non-null, and their + iteration number is less than the "delete" table's iteration number + This gives us only rows from the most recent tombstone onward, and works in both cases where + the most recent row for a key is or is not a tombstone. 
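+    A worked example (hypothetical rows for a single key k within one batch):
+      (i=1, value=v1), (i=2, value=NULL), (i=3, value=v3)
+    The "deletes" table records i=2 for k, so the filter keeps only the rows with i >= 2.
+    The merge then deletes any existing destination row for k (matched by the tombstone) and
+    inserts v3, while the stale v1 row never reaches the destination table.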
+ + MERGE ``.`` + USING ( + SELECT batch.key AS key, [partitionTime, ]value + FROM ( + SELECT src.i, src.key FROM ( + SELECT ARRAY_AGG( + x ORDER BY i DESC LIMIT 1 + )[OFFSET(0)] src + FROM ( + SELECT * FROM ``.`` + WHERE batchNumber= + ) x + WHERE x.value IS NULL + GROUP BY key.[, key....])) AS deletes + RIGHT JOIN ( + SELECT * FROM ``.` + ) AS batch + USING (key) + WHERE deletes.i IS NULL OR batch.i >= deletes.i + ORDER BY batch.i ASC) AS src + ON ``.=src.key AND src.value IS NULL + WHEN MATCHED + THEN DELETE + WHEN NOT MATCHED AND src.value IS NOT NULL + THEN INSERT (``, [_PARTITIONTIME, ]``[, ``]) + VALUES ( + src.key, + [CAST(CAST(DATE(src.partitionTime) AS DATE) AS TIMESTAMP),] + src.value.[, src.value....] + ); + */ + private String deleteMergeFlushQuery( + TableId intermediateTable, TableId destinationTable, int batchNumber, Schema intermediateSchema + ) { + List keyFields = listFields( + intermediateSchema.getFields().get(INTERMEDIATE_TABLE_KEY_FIELD_NAME).getSubFields(), + INTERMEDIATE_TABLE_KEY_FIELD_NAME + "." + ); + + List valueColumns = valueColumns(intermediateSchema); + + final String key = INTERMEDIATE_TABLE_KEY_FIELD_NAME; + final String i = INTERMEDIATE_TABLE_ITERATION_FIELD_NAME; + final String value = INTERMEDIATE_TABLE_VALUE_FIELD_NAME; + final String batch = INTERMEDIATE_TABLE_BATCH_NUMBER_FIELD; + + return "MERGE " + table(destinationTable) + " " + + "USING (" + + "SELECT batch." + key + " AS " + key + ", " + partitionTimeColumn() + value + " " + + "FROM (" + + "SELECT src." + i + ", src." + key + " FROM (" + + "SELECT ARRAY_AGG(" + + "x ORDER BY " + i + " DESC LIMIT 1" + + ")[OFFSET(0)] src " + + "FROM (" + + "SELECT * FROM " + table(intermediateTable) + " " + + "WHERE " + batch + "=" + batchNumber + + ") x " + + "WHERE x." + value + " IS NULL " + + "GROUP BY " + String.join(", ", keyFields) + ")) AS deletes " + + "RIGHT JOIN (" + + "SELECT * FROM " + table(intermediateTable) + " " + + "WHERE " + batch + "=" + batchNumber + + ") AS batch " + + "USING (" + key + ") " + + "WHERE deletes." + i + " IS NULL OR batch." + i + " >= deletes." + i + " " + + "ORDER BY batch." + i + " ASC) AS src " + + "ON `" + destinationTable.getTable() + "`." + keyFieldName + "=src." + key + " AND src." + value + " IS NULL " + + "WHEN MATCHED " + + "THEN DELETE " + + "WHEN NOT MATCHED AND src." + value + " IS NOT NULL " + + "THEN INSERT (`" + + keyFieldName + "`, " + + partitionTimePseudoColumn() + + "`" + + String.join("`, `", valueColumns) + "`) " + + "VALUES (" + + "src." + key + ", " + + partitionTimeValue() + + valueColumns.stream().map(col -> "src." + value + "." + col).collect(Collectors.joining(", ")) + + ");"; + } + + private String table(TableId tableId) { + return String.format("`%s`.`%s`", tableId.getDataset(), tableId.getTable()); + } + + private List valueColumns(Schema intermediateTableSchema) { + return intermediateTableSchema.getFields().get(INTERMEDIATE_TABLE_VALUE_FIELD_NAME).getSubFields() + .stream() + .map(Field::getName) + .collect(Collectors.toList()); + } + + private String partitionTimePseudoColumn() { + return insertPartitionTime ? "_PARTITIONTIME, " : ""; + } + + private String partitionTimeValue() { + return insertPartitionTime + ? "CAST(CAST(DATE(src." + INTERMEDIATE_TABLE_PARTITION_TIME_FIELD_NAME + ") AS DATE) AS TIMESTAMP), " + : ""; + } + + private String partitionTimeColumn() { + return insertPartitionTime + ? 
INTERMEDIATE_TABLE_PARTITION_TIME_FIELD_NAME + ", " + : ""; + } + + // DELETE FROM ``.`` WHERE batchNumber <= AND _PARTITIONTIME IS NOT NULL; + @VisibleForTesting + static String batchClearQuery(TableId intermediateTable, int batchNumber) { + return new StringBuilder("DELETE FROM `").append(intermediateTable.getDataset()).append("`.`").append(intermediateTable.getTable()).append("` ") + .append("WHERE ") + .append(INTERMEDIATE_TABLE_BATCH_NUMBER_FIELD).append(" <= ").append(batchNumber).append(" ") + // Use this clause to filter out rows that are still in the streaming buffer, which should + // not be subjected to UPDATE or DELETE operations or the query will FAIL + .append("AND _PARTITIONTIME IS NOT NULL") + .append(";") + .toString(); + } + + private static List listFields(FieldList keyFields, String prefix) { + return keyFields.stream() + .flatMap(field -> { + String fieldName = prefix + field.getName(); + FieldList subFields = field.getSubFields(); + if (subFields == null) { + return Stream.of(fieldName); + } + return listFields(subFields, fieldName + ".").stream(); + }).collect(Collectors.toList()); + } +} diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/SchemaManager.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/SchemaManager.java index e10839b65..9af2bde59 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/SchemaManager.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/SchemaManager.java @@ -1,3 +1,22 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + package com.wepay.kafka.connect.bigquery; @@ -5,36 +24,41 @@ import com.google.cloud.bigquery.BigQueryException; import com.google.cloud.bigquery.Clustering; import com.google.cloud.bigquery.Field; +import com.google.cloud.bigquery.Field.Mode; import com.google.cloud.bigquery.LegacySQLTypeName; import com.google.cloud.bigquery.StandardTableDefinition; -import com.google.cloud.bigquery.Table; import com.google.cloud.bigquery.TableId; import com.google.cloud.bigquery.TableInfo; import com.google.cloud.bigquery.TimePartitioning; import com.google.cloud.bigquery.TimePartitioning.Type; +import com.google.common.annotations.VisibleForTesting; import com.wepay.kafka.connect.bigquery.api.SchemaRetriever; import com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig; import com.wepay.kafka.connect.bigquery.convert.KafkaDataBuilder; import com.wepay.kafka.connect.bigquery.convert.SchemaConverter; import com.wepay.kafka.connect.bigquery.exception.BigQueryConnectException; - +import com.wepay.kafka.connect.bigquery.utils.FieldNameSanitizer; +import com.wepay.kafka.connect.bigquery.utils.TableNameUtils; import org.apache.kafka.connect.data.Schema; import org.apache.kafka.connect.sink.SinkRecord; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.Set; -import java.util.function.Function; -import java.util.stream.Collectors; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; + +import static com.google.common.base.Preconditions.checkState; /** * Class for managing Schemas of BigQuery tables (creating and updating). */ public class SchemaManager { + private static final Logger logger = LoggerFactory.getLogger(SchemaManager.class); private final SchemaRetriever schemaRetriever; @@ -42,10 +66,18 @@ public class SchemaManager { private final BigQuery bigQuery; private final boolean allowNewBQFields; private final boolean allowBQRequiredFieldRelaxation; + private final boolean allowSchemaUnionization; + private final boolean sanitizeFieldNames; private final Optional kafkaKeyFieldName; private final Optional kafkaDataFieldName; private final Optional timestampPartitionFieldName; + private final Optional partitionExpiration; private final Optional> clusteringFieldName; + private final Optional timePartitioningType; + private final boolean intermediateTables; + private final ConcurrentMap tableCreateLocks; + private final ConcurrentMap tableUpdateLocks; + private final ConcurrentMap schemaCache; /** * @param schemaRetriever Used to determine the Kafka Connect Schema that should be used for a @@ -54,10 +86,18 @@ public class SchemaManager { * @param bigQuery Used to communicate create/update requests to BigQuery. * @param allowNewBQFields If set to true, allows new fields to be added to BigQuery Schema. * @param allowBQRequiredFieldRelaxation If set to true, allows changing field mode from REQUIRED to NULLABLE + * @param allowSchemaUnionization If set to true, allows existing and new schemas to be unionized + * @param sanitizeFieldNames If true, sanitizes field names to adhere to BigQuery column name restrictions * @param kafkaKeyFieldName The name of kafka key field to be used in BigQuery. - * If set to null, Kafka Key Field will not be included in BigQuery. + * If set to null, Kafka Key Field will not be included in BigQuery. * @param kafkaDataFieldName The name of kafka data field to be used in BigQuery. 
* If set to null, Kafka Data Field will not be included in BigQuery. + * @param timestampPartitionFieldName The name of the field to use for column-based time + * partitioning in BigQuery. + * If set to null, ingestion time-based partitioning will be + * used instead. + * @param clusteringFieldName + * @param timePartitioningType The time partitioning type (HOUR, DAY, etc.) to use for created tables. */ public SchemaManager( SchemaRetriever schemaRetriever, @@ -65,29 +105,155 @@ public SchemaManager( BigQuery bigQuery, boolean allowNewBQFields, boolean allowBQRequiredFieldRelaxation, + boolean allowSchemaUnionization, + boolean sanitizeFieldNames, + Optional kafkaKeyFieldName, + Optional kafkaDataFieldName, + Optional timestampPartitionFieldName, + Optional partitionExpiration, + Optional> clusteringFieldName, + Optional timePartitioningType) { + this( + schemaRetriever, + schemaConverter, + bigQuery, + allowNewBQFields, + allowBQRequiredFieldRelaxation, + allowSchemaUnionization, + sanitizeFieldNames, + kafkaKeyFieldName, + kafkaDataFieldName, + timestampPartitionFieldName, + partitionExpiration, + clusteringFieldName, + timePartitioningType, + false, + new ConcurrentHashMap<>(), + new ConcurrentHashMap<>(), + new ConcurrentHashMap<>()); + } + + private SchemaManager( + SchemaRetriever schemaRetriever, + SchemaConverter schemaConverter, + BigQuery bigQuery, + boolean allowNewBQFields, + boolean allowBQRequiredFieldRelaxation, + boolean allowSchemaUnionization, + boolean sanitizeFieldNames, Optional kafkaKeyFieldName, Optional kafkaDataFieldName, Optional timestampPartitionFieldName, - Optional> clusteringFieldName) { + Optional partitionExpiration, + Optional> clusteringFieldName, + Optional timePartitioningType, + boolean intermediateTables, + ConcurrentMap tableCreateLocks, + ConcurrentMap tableUpdateLocks, + ConcurrentMap schemaCache) { this.schemaRetriever = schemaRetriever; this.schemaConverter = schemaConverter; this.bigQuery = bigQuery; this.allowNewBQFields = allowNewBQFields; this.allowBQRequiredFieldRelaxation = allowBQRequiredFieldRelaxation; + this.allowSchemaUnionization = allowSchemaUnionization; + this.sanitizeFieldNames = sanitizeFieldNames; this.kafkaKeyFieldName = kafkaKeyFieldName; this.kafkaDataFieldName = kafkaDataFieldName; this.timestampPartitionFieldName = timestampPartitionFieldName; + this.partitionExpiration = partitionExpiration; this.clusteringFieldName = clusteringFieldName; + this.timePartitioningType = timePartitioningType; + this.intermediateTables = intermediateTables; + this.tableCreateLocks = tableCreateLocks; + this.tableUpdateLocks = tableUpdateLocks; + this.schemaCache = schemaCache; + } + + public SchemaManager forIntermediateTables() { + return new SchemaManager( + schemaRetriever, + schemaConverter, + bigQuery, + allowNewBQFields, + allowBQRequiredFieldRelaxation, + allowSchemaUnionization, + sanitizeFieldNames, + kafkaKeyFieldName, + kafkaDataFieldName, + timestampPartitionFieldName, + partitionExpiration, + clusteringFieldName, + timePartitioningType, + true, + tableCreateLocks, + tableUpdateLocks, + schemaCache + ); + } + + /** + * Fetch the most recent schema for the given table, assuming it has been created and/or updated + * over the lifetime of this schema manager. 
+ * @param table the table to fetch the schema for; may be null + * @return the latest schema for that table; may be null if the table does not exist or has not + * been created or updated by this schema manager + */ + public com.google.cloud.bigquery.Schema cachedSchema(TableId table) { + return schemaCache.get(table); + } + + /** + * Create a new table in BigQuery, if it doesn't already exist. Otherwise, update the existing + * table to use the most-current schema. + * @param table The BigQuery table to create, + * @param records The sink records used to determine the schema. + */ + public void createOrUpdateTable(TableId table, List records) { + synchronized (lock(tableCreateLocks, table)) { + if (bigQuery.getTable(table) == null) { + logger.debug("{} doesn't exist; creating instead of updating", table(table)); + if (createTable(table, records)) { + return; + } + } + } + + // Table already existed; attempt to update instead + logger.debug("{} already exists; updating instead of creating", table(table)); + updateSchema(table, records); } /** * Create a new table in BigQuery. * @param table The BigQuery table to create. * @param records The sink records used to determine the schema. + * @return whether the table had to be created; if the table already existed, will return false */ - public void createTable(TableId table, Set records) { - TableInfo tableInfo = getTableInfo(table, records); - bigQuery.create(tableInfo); + public boolean createTable(TableId table, List records) { + synchronized (lock(tableCreateLocks, table)) { + if (schemaCache.containsKey(table)) { + // Table already exists; noop + logger.debug("Skipping create of {} as it should already exist or appear very soon", table(table)); + return false; + } + TableInfo tableInfo = getTableInfo(table, records, true); + logger.info("Attempting to create {} with schema {}", + table(table), tableInfo.getDefinition().getSchema()); + try { + bigQuery.create(tableInfo); + logger.debug("Successfully created {}", table(table)); + schemaCache.put(table, tableInfo.getDefinition().getSchema()); + return true; + } catch (BigQueryException e) { + if (e.getCode() == 409) { + logger.debug("Failed to create {} as it already exists (possibly created by another task)", table(table)); + schemaCache.put(table, readTableSchema(table)); + return false; + } + throw e; + } + } } /** @@ -95,31 +261,71 @@ public void createTable(TableId table, Set records) { * @param table The BigQuery table to update. * @param records The sink records used to update the schema. 
*/ - public void updateSchema(TableId table, Set records) { - TableInfo tableInfo = getTableInfo(table, records); - logger.info("Attempting to update table `{}` with schema {}", - table, tableInfo.getDefinition().getSchema()); - bigQuery.update(tableInfo); + public void updateSchema(TableId table, List records) { + synchronized (lock(tableUpdateLocks, table)) { + TableInfo tableInfo = getTableInfo(table, records, false); + if (!schemaCache.containsKey(table)) { + schemaCache.put(table, readTableSchema(table)); + } + + if (!schemaCache.get(table).equals(tableInfo.getDefinition().getSchema())) { + logger.info("Attempting to update {} with schema {}", + table(table), tableInfo.getDefinition().getSchema()); + bigQuery.update(tableInfo); + logger.debug("Successfully updated {}", table(table)); + schemaCache.put(table, tableInfo.getDefinition().getSchema()); + } else { + logger.debug("Skipping update of {} since current schema should be compatible", table(table)); + } + } } /** * Returns the {@link TableInfo} instance of a bigQuery Table * @param table The BigQuery table to return the table info * @param records The sink records used to determine the schema for constructing the table info + * @param createSchema Flag to determine if we are creating a new table schema or updating an existing table schema * @return The resulting BigQuery table information */ - private TableInfo getTableInfo(TableId table, Set records) { - List bigQuerySchemas = getSchemasList(table, records); - com.google.cloud.bigquery.Schema schema; + private TableInfo getTableInfo(TableId table, List records, Boolean createSchema) { + com.google.cloud.bigquery.Schema proposedSchema; String tableDescription; try { - schema = getUnionizedSchema(bigQuerySchemas); + proposedSchema = getAndValidateProposedSchema(table, records); tableDescription = getUnionizedTableDescription(records); } catch (BigQueryConnectException exception) { throw new BigQueryConnectException("Failed to unionize schemas of records for the table " + table, exception); } - TableInfo tableInfo = constructTableInfo(table, schema, tableDescription); - return tableInfo; + return constructTableInfo(table, proposedSchema, tableDescription, createSchema); + } + + @VisibleForTesting + com.google.cloud.bigquery.Schema getAndValidateProposedSchema( + TableId table, List records) { + com.google.cloud.bigquery.Schema result; + if (allowSchemaUnionization) { + List bigQuerySchemas = getSchemasList(table, records); + result = getUnionizedSchema(bigQuerySchemas); + } else { + com.google.cloud.bigquery.Schema existingSchema = readTableSchema(table); + SinkRecord recordToConvert = getRecordToConvert(records); + if (recordToConvert == null) { + String errorMessage = "Could not convert to BigQuery schema with a batch of tombstone records."; + if (existingSchema == null) { + throw new BigQueryConnectException(errorMessage); + } + logger.debug(errorMessage + " Will fall back to existing schema."); + return existingSchema; + } + result = convertRecordSchema(recordToConvert); + if (existingSchema != null) { + validateSchemaChange(existingSchema, result); + if (allowBQRequiredFieldRelaxation) { + result = relaxFieldsWhereNecessary(existingSchema, result); + } + } + } + return result; } /** @@ -128,21 +334,43 @@ private TableInfo getTableInfo(TableId table, Set records) { * @param records The sink records' schemas to add to the list of schemas * @return List of BigQuery schemas */ - private List getSchemasList(TableId table, Set records) { + private List getSchemasList(TableId table, 
List records) { List bigQuerySchemas = new ArrayList<>(); - if (bigQuery.getTable(table) != null) { - Table bigQueryTable = bigQuery.getTable(table.getDataset(), table.getTable()); - bigQuerySchemas.add(bigQueryTable.getDefinition().getSchema()); - } + Optional.ofNullable(readTableSchema(table)).ifPresent(bigQuerySchemas::add); for (SinkRecord record : records) { Schema kafkaValueSchema = schemaRetriever.retrieveValueSchema(record); - Schema kafkaKeySchema = kafkaKeyFieldName.isPresent() ? schemaRetriever.retrieveKeySchema(record) : null; - com.google.cloud.bigquery.Schema schema = getBigQuerySchema(kafkaKeySchema, kafkaValueSchema); - bigQuerySchemas.add(schema); + if (kafkaValueSchema == null) { + continue; + } + bigQuerySchemas.add(convertRecordSchema(record)); } return bigQuerySchemas; } + /** + * Gets a regular record from the given batch of SinkRecord for schema conversion. This is needed + * when delete is enabled, because a tombstone record has null value, thus null value schema. + * Converting null value schema to BigQuery schema is not possible. + * @param records List of SinkRecord to look for. + * @return a regular record or null if the whole batch are all tombstone records. + */ + private SinkRecord getRecordToConvert(List records) { + for (int i = records.size() - 1; i >= 0; i--) { + SinkRecord record = records.get(i); + if (schemaRetriever.retrieveValueSchema(record) != null) { + return record; + } + } + return null; + } + + private com.google.cloud.bigquery.Schema convertRecordSchema(SinkRecord record) { + Schema kafkaValueSchema = schemaRetriever.retrieveValueSchema(record); + Schema kafkaKeySchema = kafkaKeyFieldName.isPresent() ? schemaRetriever.retrieveKeySchema(record) : null; + com.google.cloud.bigquery.Schema result = getBigQuerySchema(kafkaKeySchema, kafkaValueSchema); + return result; + } + /** * Returns a unionized schema from a list of BigQuery schemas * @param schemas The list of BigQuery schemas to unionize @@ -150,49 +378,147 @@ private List getSchemasList(TableId table, Set */ private com.google.cloud.bigquery.Schema getUnionizedSchema(List schemas) { com.google.cloud.bigquery.Schema currentSchema = schemas.get(0); + com.google.cloud.bigquery.Schema proposedSchema; for (int i = 1; i < schemas.size(); i++) { - currentSchema = unionizeSchemas(currentSchema, schemas.get(i)); + proposedSchema = unionizeSchemas(currentSchema, schemas.get(i)); + validateSchemaChange(currentSchema, proposedSchema); + currentSchema = proposedSchema; } return currentSchema; } + private Field unionizeFields(Field firstField, Field secondField) { + if (secondField == null) { + if (!Field.Mode.REPEATED.equals(firstField.getMode())) { + return firstField.toBuilder().setMode(Field.Mode.NULLABLE).build(); + } else { + return firstField; + } + } + + checkState(firstField.getName().equals(secondField.getName()), + String.format("Cannot perform union operation on two fields having different names. " + + "Field names are '%s' and '%s'.", firstField.getName(), secondField.getName())); + checkState(firstField.getType() == secondField.getType(), + String.format("Cannot perform union operation on two fields having different datatypes. 
" + + "Field name is '%s' and datatypes are '%s' and '%s'.", firstField.getName(), firstField.getType(), secondField.getType())); + + Field.Builder retBuilder = firstField.toBuilder(); + if (isFieldRelaxation(firstField, secondField)) { + retBuilder.setMode(secondField.getMode()); + } + if (firstField.getType() == LegacySQLTypeName.RECORD) { + Map firstSubFields = subFields(firstField); + Map secondSubFields = subFields(secondField); + Map unionizedSubFields = new LinkedHashMap<>(); + + firstSubFields.forEach((name, firstSubField) -> { + Field secondSubField = secondSubFields.get(name); + unionizedSubFields.put(name, unionizeFields(firstSubField, secondSubField)); + }); + maybeAddToUnionizedFields(secondSubFields, unionizedSubFields); + retBuilder.setType(LegacySQLTypeName.RECORD, + unionizedSubFields.values().toArray(new Field[]{})); + } + return retBuilder.build(); + } + /** * Returns a single unionized BigQuery schema from two BigQuery schemas. * @param firstSchema The first BigQuery schema to unionize * @param secondSchema The second BigQuery schema to unionize * @return The resulting unionized BigQuery schema */ - private com.google.cloud.bigquery.Schema unionizeSchemas(com.google.cloud.bigquery.Schema firstSchema, com.google.cloud.bigquery.Schema secondSchema) { - Map firstSchemaFields = firstSchema - .getFields() - .stream() - .collect(Collectors.toMap(Field::getName, Function.identity())); - Map secondSchemaFields = secondSchema - .getFields() - .stream() - .collect(Collectors.toMap(Field::getName, Function.identity())); - for (Map.Entry entry : secondSchemaFields.entrySet()) { - if (!firstSchemaFields.containsKey(entry.getKey())) { - if (allowNewBQFields && (entry.getValue().getMode().equals(Field.Mode.NULLABLE) - || (entry.getValue().getMode().equals(Field.Mode.REQUIRED) && allowBQRequiredFieldRelaxation))) { - firstSchemaFields.put(entry.getKey(), entry.getValue().toBuilder().setMode(Field.Mode.NULLABLE).build()); + // VisibleForTesting + com.google.cloud.bigquery.Schema unionizeSchemas( + com.google.cloud.bigquery.Schema firstSchema, com.google.cloud.bigquery.Schema secondSchema) { + Map firstSchemaFields = schemaFields(firstSchema); + Map secondSchemaFields = schemaFields(secondSchema); + Map unionizedSchemaFields = new LinkedHashMap<>(); + + firstSchemaFields.forEach((name, firstField) -> { + Field secondField = secondSchemaFields.get(name); + if (secondField == null) { + // Repeated fields are implicitly nullable; no need to set a new mode for them + if (!Field.Mode.REPEATED.equals(firstField.getMode())) { + unionizedSchemaFields.put(name, firstField.toBuilder().setMode(Field.Mode.NULLABLE).build()); } else { - throw new BigQueryConnectException("New Field found with the name " + entry.getKey() - + " Ensure that " + BigQuerySinkConfig.ALLOW_NEW_BIGQUERY_FIELDS_CONFIG + " is true and " + BigQuerySinkConfig.ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_CONFIG + - " is true if " + entry.getKey() + " has mode REQUIRED in order to update the Schema"); + unionizedSchemaFields.put(name, firstField); } } else { - if (firstSchemaFields.get(entry.getKey()).getMode().equals(Field.Mode.REQUIRED) && secondSchemaFields.get(entry.getKey()).getMode().equals(Field.Mode.NULLABLE)) { - if (allowBQRequiredFieldRelaxation) { - firstSchemaFields.put(entry.getKey(), entry.getValue().toBuilder().setMode(Field.Mode.NULLABLE).build()); - } else { - throw new BigQueryConnectException( entry.getKey() + " has mode REQUIRED. 
Set " + BigQuerySinkConfig.ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_CONFIG - + " to true, to change the mode to NULLABLE"); - } + unionizedSchemaFields.put(name, unionizeFields(firstField, secondField)); + } + }); + + maybeAddToUnionizedFields(secondSchemaFields, unionizedSchemaFields); + return com.google.cloud.bigquery.Schema.of(unionizedSchemaFields.values()); + } + + private void maybeAddToUnionizedFields(Map secondSchemaFields, + Map unionizedFields) { + secondSchemaFields.forEach((name, secondField) -> { + if (!unionizedFields.containsKey(name)) { + if (Mode.REPEATED.equals(secondField.getMode())) { + // Repeated fields are implicitly nullable; no need to set a new mode for them + unionizedFields.put(name, secondField); + } else { + unionizedFields.put(name, secondField.toBuilder().setMode(Mode.NULLABLE).build()); + } + } + }); + } + + private void validateSchemaChange( + com.google.cloud.bigquery.Schema existingSchema, com.google.cloud.bigquery.Schema proposedSchema) { + logger.trace("Validating schema change. Existing schema: {}; proposed Schema: {}", + existingSchema.toString(), proposedSchema.toString()); + Map earliestSchemaFields = schemaFields(existingSchema); + Map proposedSchemaFields = schemaFields(proposedSchema); + + for (Map.Entry entry : proposedSchemaFields.entrySet()) { + if (!earliestSchemaFields.containsKey(entry.getKey())) { + if (!isValidFieldAddition(entry.getValue())) { + throw new BigQueryConnectException("New Field found with the name " + entry.getKey() + + " Ensure that " + BigQuerySinkConfig.ALLOW_NEW_BIGQUERY_FIELDS_CONFIG + " is true and " + + BigQuerySinkConfig.ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_CONFIG + + " is true if " + entry.getKey() + " has mode REQUIRED in order to update the Schema"); + } + } else if (isFieldRelaxation(earliestSchemaFields.get(entry.getKey()), entry.getValue())) { + if (!allowBQRequiredFieldRelaxation) { + throw new BigQueryConnectException( entry.getKey() + " has mode REQUIRED. 
Set " + + BigQuerySinkConfig.ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_CONFIG + + " to true, to change the mode to NULLABLE"); } } } - return com.google.cloud.bigquery.Schema.of(firstSchemaFields.values()); + } + + private boolean isFieldRelaxation(Field currentField, Field proposedField) { + return currentField.getMode().equals(Field.Mode.REQUIRED) + && proposedField.getMode().equals(Field.Mode.NULLABLE); + } + + private boolean isValidFieldAddition(Field newField) { + return allowNewBQFields && ( + newField.getMode().equals(Field.Mode.NULLABLE) || + newField.getMode().equals(Field.Mode.REPEATED) || + (newField.getMode().equals(Field.Mode.REQUIRED) && allowBQRequiredFieldRelaxation)); + } + + private com.google.cloud.bigquery.Schema relaxFieldsWhereNecessary( + com.google.cloud.bigquery.Schema existingSchema, + com.google.cloud.bigquery.Schema proposedSchema) { + Map existingSchemaFields = schemaFields(existingSchema); + Map proposedSchemaFields = schemaFields(proposedSchema); + List newSchemaFields = new ArrayList<>(); + for (Map.Entry entry : proposedSchemaFields.entrySet()) { + if (!existingSchemaFields.containsKey(entry.getKey()) && !Field.Mode.REPEATED.equals(entry.getValue().getMode())) { + newSchemaFields.add(entry.getValue().toBuilder().setMode(Field.Mode.NULLABLE).build()); + } else { + newSchemaFields.add(entry.getValue()); + } + } + return com.google.cloud.bigquery.Schema.of(newSchemaFields); } /** @@ -200,57 +526,189 @@ private com.google.cloud.bigquery.Schema unionizeSchemas(com.google.cloud.bigque * @param records The records used to get the unionized table description * @return The resulting table description */ - private String getUnionizedTableDescription(Set records) { + @VisibleForTesting + String getUnionizedTableDescription(List records) { String tableDescription = null; for (SinkRecord record : records) { Schema kafkaValueSchema = schemaRetriever.retrieveValueSchema(record); + if (kafkaValueSchema == null) { + continue; + } tableDescription = kafkaValueSchema.doc() != null ? kafkaValueSchema.doc() : tableDescription; } return tableDescription; } - // package private for testing. - TableInfo constructTableInfo(TableId table, com.google.cloud.bigquery.Schema bigQuerySchema, String tableDescription) { - TimePartitioning timePartitioning = TimePartitioning.of(Type.DAY); - if (timestampPartitionFieldName.isPresent()) { - timePartitioning = timePartitioning.toBuilder().setField(timestampPartitionFieldName.get()).build(); + private Map subFields(Field parent) { + Map result = new LinkedHashMap<>(); + if (parent == null || parent.getSubFields() == null) { + return result; } + parent.getSubFields().forEach(field -> { + if (field.getMode() == null) { + field = field.toBuilder().setMode(Mode.NULLABLE).build(); + } + result.put(field.getName(), field); + }); + return result; + } + + /** + * Returns a dictionary providing lookup of each field in the schema by name. The ordering of the + * fields in the schema is preserved in the returned map. + * @param schema The BigQuery schema + * @return A map allowing lookup of schema fields by name + */ + private Map schemaFields(com.google.cloud.bigquery.Schema schema) { + Map result = new LinkedHashMap<>(); + schema.getFields().forEach(field -> { + if (field.getMode() == null) { + field = field.toBuilder().setMode(Field.Mode.NULLABLE).build(); + } + result.put(field.getName(), field); + }); + return result; + } + // package private for testing. 
+ TableInfo constructTableInfo(TableId table, com.google.cloud.bigquery.Schema bigQuerySchema, String tableDescription, + Boolean createSchema) { StandardTableDefinition.Builder builder = StandardTableDefinition.newBuilder() - .setSchema(bigQuerySchema) - .setTimePartitioning(timePartitioning); - - if (timestampPartitionFieldName.isPresent() && clusteringFieldName.isPresent()) { - Clustering clustering = Clustering.newBuilder() - .setFields(clusteringFieldName.get()) - .build(); - builder.setClustering(clustering); + .setSchema(bigQuerySchema); + + if (intermediateTables) { + // Shameful hack: make the table ingestion time-partitioned here so that the _PARTITIONTIME + // pseudocolumn can be queried to filter out rows that are still in the streaming buffer + builder.setTimePartitioning(TimePartitioning.of(Type.DAY)); + } else if (createSchema) { + timePartitioningType.ifPresent(partitioningType -> { + TimePartitioning.Builder timePartitioningBuilder = TimePartitioning.of(partitioningType).toBuilder(); + timestampPartitionFieldName.ifPresent(timePartitioningBuilder::setField); + partitionExpiration.ifPresent(timePartitioningBuilder::setExpirationMs); + + builder.setTimePartitioning(timePartitioningBuilder.build()); + + if (timestampPartitionFieldName.isPresent() && clusteringFieldName.isPresent()) { + Clustering clustering = Clustering.newBuilder() + .setFields(clusteringFieldName.get()) + .build(); + builder.setClustering(clustering); + } + }); } StandardTableDefinition tableDefinition = builder.build(); TableInfo.Builder tableInfoBuilder = TableInfo.newBuilder(table, tableDefinition); - if (tableDescription != null) { + if (intermediateTables) { + tableInfoBuilder.setDescription("Temporary table"); + } else if (tableDescription != null) { tableInfoBuilder.setDescription(tableDescription); } + return tableInfoBuilder.build(); } private com.google.cloud.bigquery.Schema getBigQuerySchema(Schema kafkaKeySchema, Schema kafkaValueSchema) { - List allFields = new ArrayList<> (); com.google.cloud.bigquery.Schema valueSchema = schemaConverter.convertSchema(kafkaValueSchema); - allFields.addAll(valueSchema.getFields()); - if (kafkaKeyFieldName.isPresent()) { - com.google.cloud.bigquery.Schema keySchema = schemaConverter.convertSchema(kafkaKeySchema); - Field kafkaKeyField = Field.newBuilder(kafkaKeyFieldName.get(), LegacySQLTypeName.RECORD, keySchema.getFields()) - .setMode(Field.Mode.NULLABLE).build(); - allFields.add(kafkaKeyField); + + List schemaFields = intermediateTables + ? getIntermediateSchemaFields(valueSchema, kafkaKeySchema) + : getRegularSchemaFields(valueSchema, kafkaKeySchema); + + return com.google.cloud.bigquery.Schema.of(schemaFields); + } + + private List getIntermediateSchemaFields(com.google.cloud.bigquery.Schema valueSchema, Schema kafkaKeySchema) { + if (kafkaKeySchema == null) { + throw new BigQueryConnectException(String.format( + "Cannot create intermediate table without specifying a value for '%s'", + BigQuerySinkConfig.KAFKA_KEY_FIELD_NAME_CONFIG + )); + } + + List result = new ArrayList<>(); + + List valueFields = new ArrayList<>(valueSchema.getFields()); + if (kafkaDataFieldName.isPresent()) { + String dataFieldName = sanitizeFieldNames ? 
+ FieldNameSanitizer.sanitizeName(kafkaDataFieldName.get()) : kafkaDataFieldName.get(); + Field kafkaDataField = KafkaDataBuilder.buildKafkaDataField(dataFieldName); + valueFields.add(kafkaDataField); } + + // Wrap the sink record value (and possibly also its Kafka data) in a struct in order to support deletes + Field wrappedValueField = Field + .newBuilder(MergeQueries.INTERMEDIATE_TABLE_VALUE_FIELD_NAME, LegacySQLTypeName.RECORD, valueFields.toArray(new Field[0])) + .setMode(Field.Mode.NULLABLE) + .build(); + result.add(wrappedValueField); + + com.google.cloud.bigquery.Schema keySchema = schemaConverter.convertSchema(kafkaKeySchema); + Field kafkaKeyField = Field.newBuilder(MergeQueries.INTERMEDIATE_TABLE_KEY_FIELD_NAME, LegacySQLTypeName.RECORD, keySchema.getFields()) + .setMode(Field.Mode.REQUIRED) + .build(); + result.add(kafkaKeyField); + + Field iterationField = Field + .newBuilder(MergeQueries.INTERMEDIATE_TABLE_ITERATION_FIELD_NAME, LegacySQLTypeName.INTEGER) + .setMode(Field.Mode.REQUIRED) + .build(); + result.add(iterationField); + + Field partitionTimeField = Field + .newBuilder(MergeQueries.INTERMEDIATE_TABLE_PARTITION_TIME_FIELD_NAME, LegacySQLTypeName.TIMESTAMP) + .setMode(Field.Mode.NULLABLE) + .build(); + result.add(partitionTimeField); + + Field batchNumberField = Field + .newBuilder(MergeQueries.INTERMEDIATE_TABLE_BATCH_NUMBER_FIELD, LegacySQLTypeName.INTEGER) + .setMode(Field.Mode.REQUIRED) + .build(); + result.add(batchNumberField); + + return result; + } + + private List getRegularSchemaFields(com.google.cloud.bigquery.Schema valueSchema, Schema kafkaKeySchema) { + List result = new ArrayList<>(valueSchema.getFields()); + if (kafkaDataFieldName.isPresent()) { - Field kafkaDataField = KafkaDataBuilder.buildKafkaDataField(kafkaDataFieldName.get()); - allFields.add(kafkaDataField); + String dataFieldName = sanitizeFieldNames ? + FieldNameSanitizer.sanitizeName(kafkaDataFieldName.get()) : kafkaDataFieldName.get(); + Field kafkaDataField = KafkaDataBuilder.buildKafkaDataField(dataFieldName); + result.add(kafkaDataField); + } + + if (kafkaKeyFieldName.isPresent()) { + com.google.cloud.bigquery.Schema keySchema = schemaConverter.convertSchema(kafkaKeySchema); + String keyFieldName = sanitizeFieldNames ? + FieldNameSanitizer.sanitizeName(kafkaKeyFieldName.get()) : kafkaKeyFieldName.get(); + Field kafkaKeyField = Field.newBuilder( + keyFieldName, + LegacySQLTypeName.RECORD, + keySchema.getFields()).setMode(Field.Mode.NULLABLE).build(); + result.add(kafkaKeyField); } - return com.google.cloud.bigquery.Schema.of(allFields); + + return result; + } + + private String table(TableId table) { + return intermediateTables + ? 
TableNameUtils.intTable(table) + : TableNameUtils.table(table); + } + + private com.google.cloud.bigquery.Schema readTableSchema(TableId table) { + logger.trace("Reading schema for {}", table(table)); + return Optional.ofNullable(bigQuery.getTable(table)) + .map(t -> t.getDefinition().getSchema()) + .orElse(null); } + private Object lock(ConcurrentMap locks, TableId table) { + return locks.computeIfAbsent(table, t -> new Object()); + } } diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/BigQuerySinkConfig.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/BigQuerySinkConfig.java index e1ab964c8..2c6ca01f9 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/BigQuerySinkConfig.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/BigQuerySinkConfig.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.config; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,43 +17,39 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.config; import com.google.cloud.bigquery.Schema; - +import com.google.cloud.bigquery.TimePartitioning; +import com.wepay.kafka.connect.bigquery.GcpClientBuilder; import com.wepay.kafka.connect.bigquery.api.SchemaRetriever; - import com.wepay.kafka.connect.bigquery.convert.BigQueryRecordConverter; import com.wepay.kafka.connect.bigquery.convert.BigQuerySchemaConverter; import com.wepay.kafka.connect.bigquery.convert.RecordConverter; import com.wepay.kafka.connect.bigquery.convert.SchemaConverter; - +import com.wepay.kafka.connect.bigquery.retrieve.IdentitySchemaRetriever; +import com.wepay.kafka.connect.bigquery.utils.FieldNameSanitizer; import org.apache.kafka.common.config.AbstractConfig; -import org.apache.kafka.common.config.types.Password; +import org.apache.kafka.common.config.Config; import org.apache.kafka.common.config.ConfigDef; import org.apache.kafka.common.config.ConfigException; - +import org.apache.kafka.common.config.ConfigValue; +import org.apache.kafka.common.config.types.Password; +import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.sink.SinkConnector; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; -import java.util.AbstractMap; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; +import java.util.*; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; /** * Base class for connector and task configs; contains properties shared between the two of them. 
*/ public class BigQuerySinkConfig extends AbstractConfig { - private static final ConfigDef config; - private static final Logger logger = LoggerFactory.getLogger(BigQuerySinkConfig.class); - // Values taken from https://github.com/apache/kafka/blob/1.1.1/connect/runtime/src/main/java/org/apache/kafka/connect/runtime/SinkConnectorConfig.java#L33 public static final String TOPICS_CONFIG = SinkConnector.TOPICS_CONFIG; private static final ConfigDef.Type TOPICS_TYPE = ConfigDef.Type.LIST; @@ -104,7 +100,7 @@ public class BigQuerySinkConfig extends AbstractConfig { public static final String GCS_FOLDER_NAME_CONFIG = "gcsFolderName"; private static final ConfigDef.Type GCS_FOLDER_NAME_TYPE = ConfigDef.Type.STRING; - private static final Object GCS_FOLDER_NAME_DEFAULT = ""; + public static final String GCS_FOLDER_NAME_DEFAULT = ""; private static final ConfigDef.Importance GCS_FOLDER_NAME_IMPORTANCE = ConfigDef.Importance.MEDIUM; private static final String GCS_FOLDER_NAME_DOC = "The name of the folder under the bucket in which gcs blobs used to batch load to BigQuery " @@ -124,7 +120,7 @@ public class BigQuerySinkConfig extends AbstractConfig { public static final String SCHEMA_RETRIEVER_CONFIG = "schemaRetriever"; private static final ConfigDef.Type SCHEMA_RETRIEVER_TYPE = ConfigDef.Type.CLASS; - private static final Class SCHEMA_RETRIEVER_DEFAULT = null; + private static final Class SCHEMA_RETRIEVER_DEFAULT = IdentitySchemaRetriever.class; private static final ConfigDef.Importance SCHEMA_RETRIEVER_IMPORTANCE = ConfigDef.Importance.MEDIUM; private static final String SCHEMA_RETRIEVER_DOC = @@ -139,12 +135,20 @@ public class BigQuerySinkConfig extends AbstractConfig { public static final String KEY_SOURCE_CONFIG = "keySource"; private static final ConfigDef.Type KEY_SOURCE_TYPE = ConfigDef.Type.STRING; - public static final String KEY_SOURCE_DEFAULT = "FILE"; - private static final ConfigDef.Validator KEY_SOURCE_VALIDATOR = - ConfigDef.ValidString.in("FILE", "JSON"); + public static final String KEY_SOURCE_DEFAULT = GcpClientBuilder.KeySource.FILE.name(); + private static final ConfigDef.Validator KEY_SOURCE_VALIDATOR = ConfigDef.ValidString.in( + Stream.of(GcpClientBuilder.KeySource.values()) + .map(GcpClientBuilder.KeySource::name) + .collect(Collectors.toList()) + .toArray(new String[0]) + ); private static final ConfigDef.Importance KEY_SOURCE_IMPORTANCE = ConfigDef.Importance.MEDIUM; private static final String KEY_SOURCE_DOC = - "Determines whether the keyfile config is the path to the credentials json, or the json itself"; + "Determines whether the " + KEYFILE_CONFIG + " config is the path to the credentials json file " + + "or the raw json of the key itself. 
" + + "If set to " + GcpClientBuilder.KeySource.APPLICATION_DEFAULT.name() + ", the " + + KEYFILE_CONFIG + " should not be provided and the connector will use any GCP " + + "application default credentials that it can find on the Connect worker for authentication."; public static final String SANITIZE_TOPICS_CONFIG = "sanitizeTopics"; private static final ConfigDef.Type SANITIZE_TOPICS_TYPE = ConfigDef.Type.BOOLEAN; @@ -155,6 +159,73 @@ public class BigQuerySinkConfig extends AbstractConfig { "Whether to automatically sanitize topic names before using them as table names;" + " if not enabled topic names will be used directly as table names"; + public static final String TOPIC2TABLE_MAP_CONFIG = "topic2TableMap"; + private static final ConfigDef.Type TOPIC2TABLE_MAP_TYPE = ConfigDef.Type.STRING; + public static final String TOPIC2TABLE_MAP_DEFAULT = ""; + private static final ConfigDef.Importance TOPIC2TABLE_MAP_IMPORTANCE = ConfigDef.Importance.LOW; + public static final String TOPIC2TABLE_MAP_DOC = "Map of topics to tables (optional). " + + "Format: comma-separated tuples, e.g. :,:,... " + + "Note that topic name should not be modified using regex SMT while using this option." + + "Also note that SANITIZE_TOPICS_CONFIG would be ignored if this config is set." + + "Lastly, if the topic2table map doesn't contain the topic for a record, a table" + + " with the same name as the topic name would be created"; + private static final ConfigDef.Validator TOPIC2TABLE_MAP_VALIDATOR = (name, value) -> { + String topic2TableMapString = (String) ConfigDef.parseType(name, value, TOPIC2TABLE_MAP_TYPE); + + if (topic2TableMapString.isEmpty()) { + return; + } + + Map topic2TableMap = new HashMap<>(); + + for (String str : topic2TableMapString.split(",")) { + String[] tt = str.split(":"); + + if (tt.length != 2) { + throw new ConfigException( + name, + topic2TableMapString, + "One of the topic to table mappings has an invalid format." + ); + } + + String topic = tt[0].trim(); + String table = tt[1].trim(); + + if (topic.isEmpty() || table.isEmpty()) { + throw new ConfigException( + name, + topic2TableMapString, + "One of the topic to table mappings has an invalid format." + ); + } + + if (topic2TableMap.containsKey(topic)) { + throw new ConfigException( + name, + name, + String.format( + "The topic name %s is duplicated. Topic names cannot be duplicated.", + topic + ) + ); + } + + if (topic2TableMap.containsValue(table)) { + throw new ConfigException( + name, + topic2TableMapString, + String.format( + "The table name %s is duplicated. Table names cannot be duplicated.", + table + ) + ); + } + topic2TableMap.put(topic, table); + } + }; + + public static final String SANITIZE_FIELD_NAME_CONFIG = "sanitizeFieldNames"; private static final ConfigDef.Type SANITIZE_FIELD_NAME_TYPE = ConfigDef.Type.BOOLEAN; public static final Boolean SANITIZE_FIELD_NAME_DEFAULT = false; @@ -171,6 +242,7 @@ public class BigQuerySinkConfig extends AbstractConfig { public static final String KAFKA_KEY_FIELD_NAME_CONFIG = "kafkaKeyFieldName"; private static final ConfigDef.Type KAFKA_KEY_FIELD_NAME_TYPE = ConfigDef.Type.STRING; public static final String KAFKA_KEY_FIELD_NAME_DEFAULT = null; + private static final ConfigDef.Validator KAFKA_KEY_FIELD_NAME_VALIDATOR = new ConfigDef.NonEmptyString(); private static final ConfigDef.Importance KAFKA_KEY_FIELD_NAME_IMPORTANCE = ConfigDef.Importance.LOW; private static final String KAFKA_KEY_FIELD_NAME_DOC = "The name of the field of Kafka key. 
" + "Default to be null, which means Kafka Key Field will not be included."; @@ -178,6 +250,7 @@ public class BigQuerySinkConfig extends AbstractConfig { public static final String KAFKA_DATA_FIELD_NAME_CONFIG = "kafkaDataFieldName"; private static final ConfigDef.Type KAFKA_DATA_FIELD_NAME_TYPE = ConfigDef.Type.STRING; public static final String KAFKA_DATA_FIELD_NAME_DEFAULT = null; + private static final ConfigDef.Validator KAFKA_DATA_FIELD_NAME_VALIDATOR = new ConfigDef.NonEmptyString(); private static final ConfigDef.Importance KAFKA_DATA_FIELD_NAME_IMPORTANCE = ConfigDef.Importance.LOW; private static final String KAFKA_DATA_FIELD_NAME_DOC = "The name of the field of Kafka Data. " + "Default to be null, which means Kafka Data Field will not be included. "; @@ -217,30 +290,254 @@ public class BigQuerySinkConfig extends AbstractConfig { private static final String TABLE_CREATE_DOC = "Automatically create BigQuery tables if they don't already exist"; - public static final String AUTO_CREATE_BUCKET_CONFIG = "autoCreateBucket"; - private static final ConfigDef.Type AUTO_CREATE_BUCKET_TYPE = ConfigDef.Type.BOOLEAN; - public static final Boolean AUTO_CREATE_BUCKET_DEFAULT = true; + public static final String AUTO_CREATE_BUCKET_CONFIG = "autoCreateBucket"; + private static final ConfigDef.Type AUTO_CREATE_BUCKET_TYPE = ConfigDef.Type.BOOLEAN; + public static final Boolean AUTO_CREATE_BUCKET_DEFAULT = true; private static final ConfigDef.Importance AUTO_CREATE_BUCKET_IMPORTANCE = ConfigDef.Importance.MEDIUM; private static final String AUTO_CREATE_BUCKET_DOC = "Whether to automatically create the given bucket, if it does not exist. " + "Only relevant if enableBatchLoad is configured."; - public static final String ALLOW_NEW_BIGQUERY_FIELDS_CONFIG = "allowNewBigQueryFields"; - private static final ConfigDef.Type ALLOW_NEW_BIGQUERY_FIELDS_TYPE = ConfigDef.Type.BOOLEAN; - public static final Boolean ALLOW_NEW_BIGQUERY_FIELDS_DEFAULT = false; + public static final String ALLOW_NEW_BIGQUERY_FIELDS_CONFIG = "allowNewBigQueryFields"; + private static final ConfigDef.Type ALLOW_NEW_BIGQUERY_FIELDS_TYPE = ConfigDef.Type.BOOLEAN; + public static final Boolean ALLOW_NEW_BIGQUERY_FIELDS_DEFAULT = false; private static final ConfigDef.Importance ALLOW_NEW_BIGQUERY_FIELDS_IMPORTANCE = ConfigDef.Importance.MEDIUM; private static final String ALLOW_NEW_BIGQUERY_FIELDS_DOC = "If true, new fields can be added to BigQuery tables during subsequent schema updates"; - public static final String ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_CONFIG = "allowBigQueryRequiredFieldRelaxation"; - private static final ConfigDef.Type ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_TYPE = ConfigDef.Type.BOOLEAN; - public static final Boolean ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_DEFAULT = false; + public static final String ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_CONFIG = "allowBigQueryRequiredFieldRelaxation"; + private static final ConfigDef.Type ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_TYPE = ConfigDef.Type.BOOLEAN; + public static final Boolean ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_DEFAULT = false; private static final ConfigDef.Importance ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_IMPORTANCE = ConfigDef.Importance.MEDIUM; private static final String ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_DOC = "If true, fields in BigQuery Schema can be changed from REQUIRED to NULLABLE"; - static { - config = new ConfigDef() + public static final String ALLOW_SCHEMA_UNIONIZATION_CONFIG = "allowSchemaUnionization"; + private static final 
ConfigDef.Type ALLOW_SCHEMA_UNIONIZATION_TYPE = ConfigDef.Type.BOOLEAN; + public static final Boolean ALLOW_SCHEMA_UNIONIZATION_DEFAULT = false; + private static final ConfigDef.Importance ALLOW_SCHEMA_UNIONIZATION_IMPORTANCE = ConfigDef.Importance.MEDIUM; + private static final String ALLOW_SCHEMA_UNIONIZATION_DOC = + "If true, the existing table schema (if one is present) will be unionized with new " + + "record schemas during schema updates"; + + public static final String UPSERT_ENABLED_CONFIG = "upsertEnabled"; + private static final ConfigDef.Type UPSERT_ENABLED_TYPE = ConfigDef.Type.BOOLEAN; + public static final boolean UPSERT_ENABLED_DEFAULT = false; + private static final ConfigDef.Importance UPSERT_ENABLED_IMPORTANCE = ConfigDef.Importance.LOW; + private static final String UPSERT_ENABLED_DOC = + "Enable upsert functionality on the connector through the use of record keys, intermediate " + + "tables, and periodic merge flushes. Row-matching will be performed based on the contents " + + "of record keys."; + + public static final String DELETE_ENABLED_CONFIG = "deleteEnabled"; + private static final ConfigDef.Type DELETE_ENABLED_TYPE = ConfigDef.Type.BOOLEAN; + public static final boolean DELETE_ENABLED_DEFAULT = false; + private static final ConfigDef.Importance DELETE_ENABLED_IMPORTANCE = ConfigDef.Importance.LOW; + private static final String DELETE_ENABLED_DOC = + "Enable delete functionality on the connector through the use of record keys, intermediate " + + "tables, and periodic merge flushes. A delete will be performed when a record with a null " + + "value (i.e., a tombstone record) is read."; + + public static final String INTERMEDIATE_TABLE_SUFFIX_CONFIG = "intermediateTableSuffix"; + private static final ConfigDef.Type INTERMEDIATE_TABLE_SUFFIX_TYPE = ConfigDef.Type.STRING; + public static final String INTERMEDIATE_TABLE_SUFFIX_DEFAULT = "tmp"; + private static final ConfigDef.Validator INTERMEDIATE_TABLE_SUFFIX_VALIDATOR = new ConfigDef.NonEmptyString(); + private static final ConfigDef.Importance INTERMEDIATE_TABLE_SUFFIX_IMPORTANCE = ConfigDef.Importance.LOW; + private static final String INTERMEDIATE_TABLE_SUFFIX_DOC = + "A suffix that will be appended to the names of destination tables to create the names for " + + "the corresponding intermediate tables. 
Multiple intermediate tables may be created for a " + + "single destination table, but their names will always start with the name of the " + + "destination table, followed by this suffix, and possibly followed by an additional " + + "suffix."; + + public static final String MERGE_INTERVAL_MS_CONFIG = "mergeIntervalMs"; + private static final ConfigDef.Type MERGE_INTERVAL_MS_TYPE = ConfigDef.Type.LONG; + public static final long MERGE_INTERVAL_MS_DEFAULT = 60_000L; + private static final ConfigDef.Validator MERGE_INTERVAL_MS_VALIDATOR = ConfigDef.LambdaValidator.with( + (name, value) -> { + if (value == null) { + return; + } + long parsedValue = (long) ConfigDef.parseType(name, value, MERGE_INTERVAL_MS_TYPE); + + if (parsedValue == 0) { + throw new ConfigException(name, value, "Cannot be zero"); + } else if (parsedValue < -1) { + throw new ConfigException(name, value, "Cannot be less than -1"); + } + }, + () -> "Either a positive integer or -1 to disable time interval-based merging" + ); + private static final ConfigDef.Importance MERGE_INTERVAL_MS_IMPORTANCE = ConfigDef.Importance.LOW; + private static final String MERGE_INTERVAL_MS_DOC = + "How often (in milliseconds) to perform a merge flush, if upsert/delete is enabled. Can be " + + "set to -1 to disable periodic flushing."; + + public static final String MERGE_RECORDS_THRESHOLD_CONFIG = "mergeRecordsThreshold"; + private static final ConfigDef.Type MERGE_RECORDS_THRESHOLD_TYPE = ConfigDef.Type.LONG; + public static final long MERGE_RECORDS_THRESHOLD_DEFAULT = -1; + private static final ConfigDef.Validator MERGE_RECORDS_THRESHOLD_VALIDATOR = ConfigDef.LambdaValidator.with( + (name, value) -> { + if (value == null) { + return; + } + long parsedValue = (long) ConfigDef.parseType(name, value, MERGE_RECORDS_THRESHOLD_TYPE); + + if (parsedValue == 0) { + throw new ConfigException(name, value, "Cannot be zero"); + } else if (parsedValue < -1) { + throw new ConfigException(name, value, "Cannot be less than -1"); + } + }, + () -> "Either a positive integer or -1 to disable throughput-based merging" + ); + private static final ConfigDef.Importance MERGE_RECORDS_THRESHOLD_IMPORTANCE = ConfigDef.Importance.LOW; + private static final String MERGE_RECORDS_THRESHOLD_DOC = + "How many records to write to an intermediate table before performing a merge flush, if " + + "upsert/delete is enabled. Can be set to -1 to disable record count-based flushing."; + + public static final String THREAD_POOL_SIZE_CONFIG = "threadPoolSize"; + private static final ConfigDef.Type THREAD_POOL_SIZE_TYPE = ConfigDef.Type.INT; + public static final Integer THREAD_POOL_SIZE_DEFAULT = 10; + private static final ConfigDef.Validator THREAD_POOL_SIZE_VALIDATOR = ConfigDef.Range.atLeast(1); + private static final ConfigDef.Importance THREAD_POOL_SIZE_IMPORTANCE = + ConfigDef.Importance.MEDIUM; + private static final String THREAD_POOL_SIZE_DOC = + "The size of the BigQuery write thread pool. This establishes the maximum number of " + + "concurrent writes to BigQuery."; + + public static final String QUEUE_SIZE_CONFIG = "queueSize"; + private static final ConfigDef.Type QUEUE_SIZE_TYPE = ConfigDef.Type.LONG; + // should this even have a default? 
+ public static final Long QUEUE_SIZE_DEFAULT = -1L; + private static final ConfigDef.Validator QUEUE_SIZE_VALIDATOR = ConfigDef.Range.atLeast(-1); + private static final ConfigDef.Importance QUEUE_SIZE_IMPORTANCE = ConfigDef.Importance.HIGH; + private static final String QUEUE_SIZE_DOC = + "The maximum size (or -1 for no maximum size) of the worker queue for bigQuery write " + + "requests before all topics are paused. This is a soft limit; the size of the queue can " + + "go over this before topics are paused. All topics will be resumed once a flush is " + + "requested or the size of the queue drops under half of the maximum size."; + + public static final String BIGQUERY_RETRY_CONFIG = "bigQueryRetry"; + private static final ConfigDef.Type BIGQUERY_RETRY_TYPE = ConfigDef.Type.INT; + public static final Integer BIGQUERY_RETRY_DEFAULT = 0; + private static final ConfigDef.Validator BIGQUERY_RETRY_VALIDATOR = ConfigDef.Range.atLeast(0); + private static final ConfigDef.Importance BIGQUERY_RETRY_IMPORTANCE = + ConfigDef.Importance.MEDIUM; + private static final String BIGQUERY_RETRY_DOC = + "The number of retry attempts that will be made per BigQuery request that fails with a " + + "backend error or a quota exceeded error"; + + public static final String BIGQUERY_RETRY_WAIT_CONFIG = "bigQueryRetryWait"; + private static final ConfigDef.Type BIGQUERY_RETRY_WAIT_CONFIG_TYPE = ConfigDef.Type.LONG; + public static final Long BIGQUERY_RETRY_WAIT_DEFAULT = 1000L; + private static final ConfigDef.Validator BIGQUERY_RETRY_WAIT_VALIDATOR = + ConfigDef.Range.atLeast(0); + private static final ConfigDef.Importance BIGQUERY_RETRY_WAIT_IMPORTANCE = + ConfigDef.Importance.MEDIUM; + private static final String BIGQUERY_RETRY_WAIT_DOC = + "The minimum amount of time, in milliseconds, to wait between BigQuery backend or quota " + + "exceeded error retry attempts."; + + public static final String BIGQUERY_MESSAGE_TIME_PARTITIONING_CONFIG = + "bigQueryMessageTimePartitioning"; + private static final ConfigDef.Type BIGQUERY_MESSAGE_TIME_PARTITIONING_CONFIG_TYPE = + ConfigDef.Type.BOOLEAN; + public static final Boolean BIGQUERY_MESSAGE_TIME_PARTITIONING_DEFAULT = false; + private static final ConfigDef.Importance BIGQUERY_MESSAGE_TIME_PARTITIONING_IMPORTANCE = + ConfigDef.Importance.HIGH; + private static final String BIGQUERY_MESSAGE_TIME_PARTITIONING_DOC = + "Whether or not to use the message time when inserting records. " + + "Default uses the connector processing time."; + + public static final String BIGQUERY_PARTITION_DECORATOR_CONFIG = + "bigQueryPartitionDecorator"; + private static final ConfigDef.Type BIGQUERY_PARTITION_DECORATOR_CONFIG_TYPE = + ConfigDef.Type.BOOLEAN; + //This has been set to true to preserve the existing behavior. However, we can set it to false if field based partitioning is used in BigQuery + public static final Boolean BIGQUERY_PARTITION_DECORATOR_DEFAULT = true; + private static final ConfigDef.Importance BIGQUERY_PARTITION_DECORATOR_IMPORTANCE = + ConfigDef.Importance.HIGH; + private static final String BIGQUERY_PARTITION_DECORATOR_DOC = + "Whether or not to append partition decorator to BigQuery table name when inserting records. " + + "Default is true. Setting this to true appends partition decorator to table name (e.g. table$yyyyMMdd depending on the configuration set for bigQueryPartitionDecorator). 
" + + "Setting this to false bypasses the logic to append the partition decorator and uses raw table name for inserts."; + + public static final String BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_CONFIG = "timestampPartitionFieldName"; + private static final ConfigDef.Type BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_TYPE = ConfigDef.Type.STRING; + private static final String BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_DEFAULT = null; + private static final ConfigDef.Validator BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_VALIDATOR = new ConfigDef.NonEmptyString(); + private static final ConfigDef.Importance BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_IMPORTANCE = + ConfigDef.Importance.LOW; + private static final String BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_DOC = + "The name of the field in the value that contains the timestamp to partition by in BigQuery" + + " and enable timestamp partitioning for each table. Leave this configuration blank," + + " to enable ingestion time partitioning for each table."; + + public static final String BIGQUERY_CLUSTERING_FIELD_NAMES_CONFIG = "clusteringPartitionFieldNames"; + private static final ConfigDef.Type BIGQUERY_CLUSTERING_FIELD_NAMES_TYPE = ConfigDef.Type.LIST; + private static final List BIGQUERY_CLUSTERING_FIELD_NAMES_DEFAULT = null; + private static final ConfigDef.Validator BIGQUERY_CLUSTERING_FIELD_NAMES_VALIDATOR = (name, value) -> { + if (value == null) { + return; + } + + @SuppressWarnings("unchecked") + List parsedValue = (List) value; + if (parsedValue.size() > 4) { + throw new ConfigException(name, value, "You may only specify up to four clustering field names."); + } + }; + private static final ConfigDef.Importance BIGQUERY_CLUSTERING_FIELD_NAMES_IMPORTANCE = + ConfigDef.Importance.LOW; + private static final String BIGQUERY_CLUSTERING_FIELD_NAMES_DOC = + "List of fields on which data should be clustered by in BigQuery, separated by commas"; + + public static final String TIME_PARTITIONING_TYPE_CONFIG = "timePartitioningType"; + private static final ConfigDef.Type TIME_PARTITIONING_TYPE_TYPE = ConfigDef.Type.STRING; + public static final String TIME_PARTITIONING_TYPE_DEFAULT = TimePartitioning.Type.DAY.name().toUpperCase(); + public static final String TIME_PARTITIONING_TYPE_NONE = "NONE"; + private static final ConfigDef.Importance TIME_PARTITIONING_TYPE_IMPORTANCE = ConfigDef.Importance.LOW; + private static final List TIME_PARTITIONING_TYPES = Stream.concat( + Stream.of(TimePartitioning.Type.values()).map(TimePartitioning.Type::name), + Stream.of(TIME_PARTITIONING_TYPE_NONE)) + .collect(Collectors.toList()); + private static final String TIME_PARTITIONING_TYPE_DOC = + "The time partitioning type to use when creating tables, or '" + + TIME_PARTITIONING_TYPE_NONE + "' to create non-partitioned tables. 
" + + "Existing tables will not be altered to use this partitioning type."; + + public static final String BIGQUERY_PARTITION_EXPIRATION_CONFIG = "partitionExpirationMs"; + private static final ConfigDef.Type BIGQUERY_PARTITION_EXPIRATION_TYPE = ConfigDef.Type.LONG; + private static final String BIGQUERY_PARTITION_EXPIRATION_DEFAULT = null; + private static final ConfigDef.Validator BIGQUERY_PARTITION_EXPIRATION_VALIDATOR = (name, value) -> { + if (value != null) { + ConfigDef.Range.atLeast(1).ensureValid(name, value); + } + }; + private static final ConfigDef.Importance BIGQUERY_PARTITION_EXPIRATION_IMPORTANCE = ConfigDef.Importance.LOW; + private static final String BIGQUERY_PARTITION_EXPIRATION_DOC = + "The amount of time, in milliseconds, after which partitions should be deleted from the tables this " + + "connector creates. If this field is set, all data in partitions in this connector's tables that are " + + "older than the specified partition expiration time will be permanently deleted. " + + "Existing tables will not be altered to use this partition expiration time."; + + //This config determines where the connector is hosted (Confluent Cloud or Confluent Platform). + //This is not enforced and defaulted to "Confluent Platform". Currently, it is only used for user-agent tracking in GCP. + public static final String CONNECTOR_RUNTIME_PROVIDER_CONFIG = "runtimeProvider"; + private static final ConfigDef.Type CONNECTOR_RUNTIME_PROVIDER_TYPE = ConfigDef.Type.STRING; + public static final String CONNECTOR_RUNTIME_PROVIDER_DEFAULT = "Confluent Platform"; + private static final ConfigDef.Importance CONNECTOR_RUNTIME_PROVIDER_IMPORTANCE = ConfigDef.Importance.LOW; + public static final List CONNECTOR_RUNTIME_PROVIDER_TYPES = Stream.of("Confluent Platform", "Confluent Cloud") + .collect(Collectors.toList()); + + /** + * Return the ConfigDef object used to define this config's fields. + * + * @return The ConfigDef object used to define this config's fields. 
+ */ + public static ConfigDef getConfig() { + return new ConfigDef() .define( TOPICS_CONFIG, TOPICS_TYPE, @@ -322,6 +619,13 @@ public class BigQuerySinkConfig extends AbstractConfig { SANITIZE_TOPICS_IMPORTANCE, SANITIZE_TOPICS_DOC ).define( + TOPIC2TABLE_MAP_CONFIG, + TOPIC2TABLE_MAP_TYPE, + TOPIC2TABLE_MAP_DEFAULT, + TOPIC2TABLE_MAP_VALIDATOR, + TOPIC2TABLE_MAP_IMPORTANCE, + TOPIC2TABLE_MAP_DOC + ).define( SANITIZE_FIELD_NAME_CONFIG, SANITIZE_FIELD_NAME_TYPE, SANITIZE_FIELD_NAME_DEFAULT, @@ -331,12 +635,14 @@ public class BigQuerySinkConfig extends AbstractConfig { KAFKA_KEY_FIELD_NAME_CONFIG, KAFKA_KEY_FIELD_NAME_TYPE, KAFKA_KEY_FIELD_NAME_DEFAULT, + KAFKA_KEY_FIELD_NAME_VALIDATOR, KAFKA_KEY_FIELD_NAME_IMPORTANCE, KAFKA_KEY_FIELD_NAME_DOC ).define( KAFKA_DATA_FIELD_NAME_CONFIG, KAFKA_DATA_FIELD_NAME_TYPE, KAFKA_DATA_FIELD_NAME_DEFAULT, + KAFKA_DATA_FIELD_NAME_VALIDATOR, KAFKA_DATA_FIELD_NAME_IMPORTANCE, KAFKA_DATA_FIELD_NAME_DOC ).define( @@ -382,36 +688,217 @@ public class BigQuerySinkConfig extends AbstractConfig { ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_DEFAULT, ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_IMPORTANCE, ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_DOC + ).define( + ALLOW_SCHEMA_UNIONIZATION_CONFIG, + ALLOW_SCHEMA_UNIONIZATION_TYPE, + ALLOW_SCHEMA_UNIONIZATION_DEFAULT, + ALLOW_SCHEMA_UNIONIZATION_IMPORTANCE, + ALLOW_SCHEMA_UNIONIZATION_DOC + ).define( + UPSERT_ENABLED_CONFIG, + UPSERT_ENABLED_TYPE, + UPSERT_ENABLED_DEFAULT, + UPSERT_ENABLED_IMPORTANCE, + UPSERT_ENABLED_DOC + ).define( + DELETE_ENABLED_CONFIG, + DELETE_ENABLED_TYPE, + DELETE_ENABLED_DEFAULT, + DELETE_ENABLED_IMPORTANCE, + DELETE_ENABLED_DOC + ).define( + INTERMEDIATE_TABLE_SUFFIX_CONFIG, + INTERMEDIATE_TABLE_SUFFIX_TYPE, + INTERMEDIATE_TABLE_SUFFIX_DEFAULT, + INTERMEDIATE_TABLE_SUFFIX_VALIDATOR, + INTERMEDIATE_TABLE_SUFFIX_IMPORTANCE, + INTERMEDIATE_TABLE_SUFFIX_DOC + ).define( + MERGE_INTERVAL_MS_CONFIG, + MERGE_INTERVAL_MS_TYPE, + MERGE_INTERVAL_MS_DEFAULT, + MERGE_INTERVAL_MS_VALIDATOR, + MERGE_INTERVAL_MS_IMPORTANCE, + MERGE_INTERVAL_MS_DOC + ).define( + MERGE_RECORDS_THRESHOLD_CONFIG, + MERGE_RECORDS_THRESHOLD_TYPE, + MERGE_RECORDS_THRESHOLD_DEFAULT, + MERGE_RECORDS_THRESHOLD_VALIDATOR, + MERGE_RECORDS_THRESHOLD_IMPORTANCE, + MERGE_RECORDS_THRESHOLD_DOC + ).define( + THREAD_POOL_SIZE_CONFIG, + THREAD_POOL_SIZE_TYPE, + THREAD_POOL_SIZE_DEFAULT, + THREAD_POOL_SIZE_VALIDATOR, + THREAD_POOL_SIZE_IMPORTANCE, + THREAD_POOL_SIZE_DOC + ).define( + QUEUE_SIZE_CONFIG, + QUEUE_SIZE_TYPE, + QUEUE_SIZE_DEFAULT, + QUEUE_SIZE_VALIDATOR, + QUEUE_SIZE_IMPORTANCE, + QUEUE_SIZE_DOC + ).define( + BIGQUERY_RETRY_CONFIG, + BIGQUERY_RETRY_TYPE, + BIGQUERY_RETRY_DEFAULT, + BIGQUERY_RETRY_VALIDATOR, + BIGQUERY_RETRY_IMPORTANCE, + BIGQUERY_RETRY_DOC + ).define( + BIGQUERY_RETRY_WAIT_CONFIG, + BIGQUERY_RETRY_WAIT_CONFIG_TYPE, + BIGQUERY_RETRY_WAIT_DEFAULT, + BIGQUERY_RETRY_WAIT_VALIDATOR, + BIGQUERY_RETRY_WAIT_IMPORTANCE, + BIGQUERY_RETRY_WAIT_DOC + ).define( + BIGQUERY_MESSAGE_TIME_PARTITIONING_CONFIG, + BIGQUERY_MESSAGE_TIME_PARTITIONING_CONFIG_TYPE, + BIGQUERY_MESSAGE_TIME_PARTITIONING_DEFAULT, + BIGQUERY_MESSAGE_TIME_PARTITIONING_IMPORTANCE, + BIGQUERY_MESSAGE_TIME_PARTITIONING_DOC + ).define( + BIGQUERY_PARTITION_DECORATOR_CONFIG, + BIGQUERY_PARTITION_DECORATOR_CONFIG_TYPE, + BIGQUERY_PARTITION_DECORATOR_DEFAULT, + BIGQUERY_PARTITION_DECORATOR_IMPORTANCE, + BIGQUERY_PARTITION_DECORATOR_DOC + ).define( + BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_CONFIG, + BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_TYPE, + 
BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_DEFAULT, + BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_VALIDATOR, + BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_IMPORTANCE, + BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_DOC + ).define( + BIGQUERY_CLUSTERING_FIELD_NAMES_CONFIG, + BIGQUERY_CLUSTERING_FIELD_NAMES_TYPE, + BIGQUERY_CLUSTERING_FIELD_NAMES_DEFAULT, + BIGQUERY_CLUSTERING_FIELD_NAMES_VALIDATOR, + BIGQUERY_CLUSTERING_FIELD_NAMES_IMPORTANCE, + BIGQUERY_CLUSTERING_FIELD_NAMES_DOC + ).define( + TIME_PARTITIONING_TYPE_CONFIG, + TIME_PARTITIONING_TYPE_TYPE, + TIME_PARTITIONING_TYPE_DEFAULT, + (name, value) -> { + if (value == null) { + return; + } + String[] validStrings = TIME_PARTITIONING_TYPES.stream().map(String::toLowerCase).toArray(String[]::new); + String lowercaseValue = ((String) value).toLowerCase(); + ConfigDef.ValidString.in(validStrings).ensureValid(name, lowercaseValue); + }, + TIME_PARTITIONING_TYPE_IMPORTANCE, + TIME_PARTITIONING_TYPE_DOC, + "", + -1, + ConfigDef.Width.NONE, + TIME_PARTITIONING_TYPE_CONFIG, + new ConfigDef.Recommender() { + @Override + public List validValues(String s, Map map) { + // Construct a new list to transform from List to List + return new ArrayList<>(TIME_PARTITIONING_TYPES); + } + + @Override + public boolean visible(String s, Map map) { + return true; + } + } + ).define( + BIGQUERY_PARTITION_EXPIRATION_CONFIG, + BIGQUERY_PARTITION_EXPIRATION_TYPE, + BIGQUERY_PARTITION_EXPIRATION_DEFAULT, + BIGQUERY_PARTITION_EXPIRATION_VALIDATOR, + BIGQUERY_PARTITION_EXPIRATION_IMPORTANCE, + BIGQUERY_PARTITION_EXPIRATION_DOC + ).defineInternal( + CONNECTOR_RUNTIME_PROVIDER_CONFIG, + CONNECTOR_RUNTIME_PROVIDER_TYPE, + CONNECTOR_RUNTIME_PROVIDER_DEFAULT, + CONNECTOR_RUNTIME_PROVIDER_IMPORTANCE ); } - /** - * Throw an exception if the passed-in properties do not constitute a valid sink. - * @param props sink configuration properties - */ - public static void validate(Map props) { - final boolean hasTopicsConfig = hasTopicsConfig(props); - final boolean hasTopicsRegexConfig = hasTopicsRegexConfig(props); - - if (hasTopicsConfig && hasTopicsRegexConfig) { - throw new ConfigException(TOPICS_CONFIG + " and " + TOPICS_REGEX_CONFIG + - " are mutually exclusive options, but both are set."); - } - if (!hasTopicsConfig && !hasTopicsRegexConfig) { - throw new ConfigException("Must configure one of " + - TOPICS_CONFIG + " or " + TOPICS_REGEX_CONFIG); - } - } + private static final List> MULTI_PROPERTY_VALIDATIONS = new ArrayList<>(); - public static boolean hasTopicsConfig(Map props) { - String topicsStr = props.get(TOPICS_CONFIG); - return topicsStr != null && !topicsStr.trim().isEmpty(); - } + static { + // Note that order matters here: validations are performed in the order they're added to this list, and if a + // property or any of the properties that it depends on has an error, validation for it gets skipped. + // This comes in handy for things like checking for the existence of tables, which requires valid BigQuery + // credentials. We validate those credentials before checking for tables so that we can safely assume while + // checking for those tables that the credentials are already valid. 
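The ordering comment above matters because each validator in this list only runs once its own property and its dependencies have passed the per-property checks. A sketch of how the combined preflight validation can be driven and inspected, assuming the supplied properties are complete enough for the constructor's own required-property parsing to succeed:

```java
import java.util.HashMap;
import java.util.Map;

import org.apache.kafka.common.config.Config;
import org.apache.kafka.common.config.ConfigValue;

import com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig;

public class PreflightValidationSketch {
  public static void main(String[] args) {
    Map<String, String> props = new HashMap<>();
    // ... populate connector properties here; missing required properties
    // would already cause the constructor below to throw ...

    BigQuerySinkConfig config = new BigQuerySinkConfig(props);
    // validate() re-runs the single-property checks and then layers the
    // cross-property validators registered above on top, collecting errors
    // instead of throwing.
    Config result = config.validate();
    for (ConfigValue value : result.configValues()) {
      if (!value.errorMessages().isEmpty()) {
        System.out.println(value.name() + ": " + value.errorMessages());
      }
    }
  }
}
```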
+ MULTI_PROPERTY_VALIDATIONS.add(new CredentialsValidator.BigQueryCredentialsValidator()); + MULTI_PROPERTY_VALIDATIONS.add(new CredentialsValidator.GcsCredentialsValidator()); + MULTI_PROPERTY_VALIDATIONS.add(new GcsBucketValidator()); + MULTI_PROPERTY_VALIDATIONS.add(new PartitioningModeValidator()); + MULTI_PROPERTY_VALIDATIONS.add(new PartitioningTypeValidator()); + MULTI_PROPERTY_VALIDATIONS.add(new UpsertDeleteValidator.UpsertValidator()); + MULTI_PROPERTY_VALIDATIONS.add(new UpsertDeleteValidator.DeleteValidator()); + } - public static boolean hasTopicsRegexConfig(Map props) { - String topicsRegexStr = props.get(TOPICS_REGEX_CONFIG); - return topicsRegexStr != null && !topicsRegexStr.trim().isEmpty(); + /** + * Used in conjunction with {@link com.wepay.kafka.connect.bigquery.BigQuerySinkConnector#validate(Map)} to perform + * preflight configuration checks. Simple validations that only require a single property value at a time (such as + * ensuring that boolean properties only contain true/false values, or that values for required properties are + * provided) are handled automatically by the {@link #getConfig() ConfigDef} for this class and optionally-defined + * custom {@link ConfigDef.Validator validators}. Other, more sophisticated validations that require multiple + * property values at a time (such as checking if all of the tables the connector will write to already exist if + * automatic table creation is disabled) are performed manually in a subsequent step. + * + * @return a {@link Config} object containing all errors that the connector was able to detect during preflight + * validation of this configuration; never null + */ + public Config validate() { + List initialValidation = getConfig().validate(originalsStrings()); + Map valuesByName = initialValidation + .stream() + .collect(Collectors.toMap(ConfigValue::name, Function.identity())); + MULTI_PROPERTY_VALIDATIONS.forEach(validator -> { + ConfigValue value = valuesByName.get(validator.propertyName()); + validator.validate(value, this, valuesByName).ifPresent(value::addErrorMessage); + }); + return new Config(initialValidation); + } + + /** + * @return the key, which is (depending on the key source property) either a path to a file or a raw JSON string + */ + public String getKey() { + return Optional.ofNullable(getPassword(KEYFILE_CONFIG)).map(Password::value).orElse(null); + } + + /** + * @return the {@link com.wepay.kafka.connect.bigquery.GcpClientBuilder.KeySource key source type} that dictates how + * the {@link #getKey()} should be be interpreted + */ + public GcpClientBuilder.KeySource getKeySource() { + String rawKeySource = getString(KEY_SOURCE_CONFIG); + try { + return GcpClientBuilder.KeySource.valueOf(rawKeySource); + } catch (IllegalArgumentException e) { + // Should never happen with preflight validation of the key source property + throw new ConnectException("Invalid key source type: " + rawKeySource); } + } + + public static boolean upsertDeleteEnabled(Map props) { + String upsertStr = props.get(UPSERT_ENABLED_CONFIG); + String deleteStr = props.get(DELETE_ENABLED_CONFIG); + return Boolean.TRUE.toString().equalsIgnoreCase(upsertStr) + || Boolean.TRUE.toString().equalsIgnoreCase(deleteStr); + } + + public static boolean gcsBatchLoadingEnabled(Map props) { + String batchLoadStr = props.get(ENABLE_BATCH_CONFIG); + return batchLoadStr != null && !batchLoadStr.isEmpty(); + } /** * Returns the keyfile @@ -425,7 +912,9 @@ public String getKeyFile() { * @return a {@link SchemaConverter} for BigQuery. 
*/ public SchemaConverter getSchemaConverter() { - return new BigQuerySchemaConverter(getBoolean(ALL_BQ_FIELDS_NULLABLE_CONFIG)); + return new BigQuerySchemaConverter( + getBoolean(ALL_BQ_FIELDS_NULLABLE_CONFIG), + getBoolean(SANITIZE_FIELD_NAME_CONFIG)); } /** @@ -464,7 +953,7 @@ public SchemaRetriever getSchemaRetriever() { Class schemaRetrieverClass = userSpecifiedClass.asSubclass(SchemaRetriever.class); - Constructor schemaRetrieverConstructor = null; + Constructor schemaRetrieverConstructor; try { schemaRetrieverConstructor = schemaRetrieverClass.getConstructor(); } catch (NoSuchMethodException nsme) { @@ -474,7 +963,7 @@ public SchemaRetriever getSchemaRetriever() { ); } - SchemaRetriever schemaRetriever = null; + SchemaRetriever schemaRetriever; try { schemaRetriever = schemaRetrieverConstructor.newInstance(); } catch (InstantiationException @@ -493,7 +982,6 @@ public SchemaRetriever getSchemaRetriever() { } /** - * * If the connector is configured to load Kafka data into BigQuery, this config defines * the name of the kafka data field. A structure is created under the field name to contain * kafka data schema including topic, offset, partition and insertTime. @@ -505,7 +993,6 @@ public Optional getKafkaKeyFieldName() { } /** - * * If the connector is configured to load Kafka keys into BigQuery, this config defines * the name of the kafka key field. A structure is created under the field name to contain * a topic's Kafka key schema. @@ -516,59 +1003,86 @@ public Optional getKafkaDataFieldName() { return Optional.ofNullable(getString(KAFKA_DATA_FIELD_NAME_CONFIG)); } - /** - * Verifies that a bucket is specified if GCS batch loading is enabled. - * @throws ConfigException Exception thrown if no bucket is specified and batch loading is on. - */ - private void verifyBucketSpecified() throws ConfigException { - // Throw an exception if GCS Batch loading will be used but no bucket is specified - if (getString(GCS_BUCKET_NAME_CONFIG).equals("") - && !getList(ENABLE_BATCH_CONFIG).isEmpty()) { - throw new ConfigException("Batch loading enabled for some topics, but no bucket specified"); - } + public boolean isUpsertDeleteEnabled() { + return getBoolean(UPSERT_ENABLED_CONFIG) || getBoolean(DELETE_ENABLED_CONFIG); } - private void checkAutoCreateTables() { + public Optional getTimePartitioningType() { + return parseTimePartitioningType(getString(TIME_PARTITIONING_TYPE_CONFIG)); + } + + public Optional> getTopic2TableMap() { + return Optional.ofNullable(parseTopic2TableMapConfig(getString(TOPIC2TABLE_MAP_CONFIG))); + } + + private Optional parseTimePartitioningType(String rawPartitioningType) { + if (rawPartitioningType == null) { + throw new ConfigException(TIME_PARTITIONING_TYPE_CONFIG, + rawPartitioningType, + "Must be one of " + String.join(", ", TIME_PARTITIONING_TYPES)); + } - Class schemaRetriever = getClass(BigQuerySinkConfig.SCHEMA_RETRIEVER_CONFIG); - boolean autoCreateTables = getBoolean(TABLE_CREATE_CONFIG); + if (TIME_PARTITIONING_TYPE_NONE.equals(rawPartitioningType)) { + return Optional.empty(); + } - if (autoCreateTables && schemaRetriever == null) { + try { + return Optional.of(TimePartitioning.Type.valueOf(rawPartitioningType)); + } catch (IllegalArgumentException e) { throw new ConfigException( - "Cannot specify automatic table creation without a schema retriever" - ); + TIME_PARTITIONING_TYPE_CONFIG, + rawPartitioningType, + "Must be one of " + String.join(", ", TIME_PARTITIONING_TYPES)); } } - private void checkBigQuerySchemaUpdateConfigs() { - boolean allBQFieldsNullable 
= getBoolean(ALL_BQ_FIELDS_NULLABLE_CONFIG); - boolean allowBQRequiredFieldRelaxation = getBoolean(ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_CONFIG); - if (allBQFieldsNullable && !allowBQRequiredFieldRelaxation) { - throw new ConfigException( - "Conflicting Configs, allBQFieldsNullable can be true only if allowBigQueryFieldRelaxation is true" - ); + private Map parseTopic2TableMapConfig(String topic2TableMapString) { + if (topic2TableMapString.isEmpty()) { + return null; + } + Map topic2TableMap = new HashMap<>(); + // It's already validated, so we can just populate the map + for (String str : topic2TableMapString.split(",")) { + String[] tt = str.split(":"); + String topic = tt[0].trim(); + String table = tt[1].trim(); + topic2TableMap.put(topic, table); } + return topic2TableMap.isEmpty() ? null : topic2TableMap; } /** - * Return the ConfigDef object used to define this config's fields. - * - * @return The ConfigDef object used to define this config's fields. + * Returns the partition expiration in ms. + * @return Long that represents the partition expiration. */ - public static ConfigDef getConfig() { - return config; + public Optional getPartitionExpirationMs() { + return Optional.ofNullable(getLong(BIGQUERY_PARTITION_EXPIRATION_CONFIG)); + } + + /** + * Returns the field name to use for timestamp partitioning. + * @return String that represents the field name. + */ + public Optional getTimestampPartitionFieldName() { + return Optional.ofNullable(getString(BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_CONFIG)); + } + + /** + * Returns the field names to use for clustering. + * @return List of Strings that represent the field names. + */ + public Optional> getClusteringPartitionFieldNames() { + return Optional + .ofNullable(getList(BIGQUERY_CLUSTERING_FIELD_NAMES_CONFIG)) + // With Java 11 there's Predicate::not, but for now we have to just manually invert the isEmpty check + .filter(l -> !l.isEmpty()); } protected BigQuerySinkConfig(ConfigDef config, Map properties) { super(config, properties); - verifyBucketSpecified(); } public BigQuerySinkConfig(Map properties) { - super(config, properties); - verifyBucketSpecified(); - checkAutoCreateTables(); - checkBigQuerySchemaUpdateConfigs(); + this(getConfig(), properties); } - } diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/BigQuerySinkTaskConfig.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/BigQuerySinkTaskConfig.java index 9b395700b..9550d7410 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/BigQuerySinkTaskConfig.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/BigQuerySinkTaskConfig.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.config; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,14 +17,9 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.config; -import java.util.List; -import java.util.Optional; import org.apache.kafka.common.config.ConfigDef; -import org.apache.kafka.common.config.ConfigException; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.util.Map; @@ -32,226 +27,40 @@ * Class for task-specific configuration properties. 
*/ public class BigQuerySinkTaskConfig extends BigQuerySinkConfig { - private static final ConfigDef config; - private static final Logger logger = LoggerFactory.getLogger(BigQuerySinkTaskConfig.class); - - public static final String THREAD_POOL_SIZE_CONFIG = "threadPoolSize"; - private static final ConfigDef.Type THREAD_POOL_SIZE_TYPE = ConfigDef.Type.INT; - public static final Integer THREAD_POOL_SIZE_DEFAULT = 10; - private static final ConfigDef.Validator THREAD_POOL_SIZE_VALIDATOR = ConfigDef.Range.atLeast(1); - private static final ConfigDef.Importance THREAD_POOL_SIZE_IMPORTANCE = - ConfigDef.Importance.MEDIUM; - private static final String THREAD_POOL_SIZE_DOC = - "The size of the BigQuery write thread pool. This establishes the maximum number of " - + "concurrent writes to BigQuery."; - - public static final String QUEUE_SIZE_CONFIG = "queueSize"; - private static final ConfigDef.Type QUEUE_SIZE_TYPE = ConfigDef.Type.LONG; - // should this even have a default? - public static final Long QUEUE_SIZE_DEFAULT = -1L; - private static final ConfigDef.Validator QUEUE_SIZE_VALIDATOR = ConfigDef.Range.atLeast(-1); - private static final ConfigDef.Importance QUEUE_SIZE_IMPORTANCE = ConfigDef.Importance.HIGH; - private static final String QUEUE_SIZE_DOC = - "The maximum size (or -1 for no maximum size) of the worker queue for bigQuery write " - + "requests before all topics are paused. This is a soft limit; the size of the queue can " - + "go over this before topics are paused. All topics will be resumed once a flush is " - + "requested or the size of the queue drops under half of the maximum size."; - - public static final String BIGQUERY_RETRY_CONFIG = "bigQueryRetry"; - private static final ConfigDef.Type BIGQUERY_RETRY_TYPE = ConfigDef.Type.INT; - public static final Integer BIGQUERY_RETRY_DEFAULT = 0; - private static final ConfigDef.Validator BIGQUERY_RETRY_VALIDATOR = ConfigDef.Range.atLeast(0); - private static final ConfigDef.Importance BIGQUERY_RETRY_IMPORTANCE = - ConfigDef.Importance.MEDIUM; - private static final String BIGQUERY_RETRY_DOC = - "The number of retry attempts that will be made per BigQuery request that fails with a " - + "backend error or a quota exceeded error"; - - public static final String BIGQUERY_RETRY_WAIT_CONFIG = "bigQueryRetryWait"; - private static final ConfigDef.Type BIGQUERY_RETRY_WAIT_CONFIG_TYPE = ConfigDef.Type.LONG; - public static final Long BIGQUERY_RETRY_WAIT_DEFAULT = 1000L; - private static final ConfigDef.Validator BIGQUERY_RETRY_WAIT_VALIDATOR = - ConfigDef.Range.atLeast(0); - private static final ConfigDef.Importance BIGQUERY_RETRY_WAIT_IMPORTANCE = - ConfigDef.Importance.MEDIUM; - private static final String BIGQUERY_RETRY_WAIT_DOC = - "The minimum amount of time, in milliseconds, to wait between BigQuery backend or quota " - + "exceeded error retry attempts."; - - public static final String BIGQUERY_MESSAGE_TIME_PARTITIONING_CONFIG = - "bigQueryMessageTimePartitioning"; - private static final ConfigDef.Type BIGQUERY_MESSAGE_TIME_PARTITIONING_CONFIG_TYPE = - ConfigDef.Type.BOOLEAN; - public static final Boolean BIGQUERY_MESSAGE_TIME_PARTITIONING_DEFAULT = false; - private static final ConfigDef.Importance BIGQUERY_MESSAGE_TIME_PARTITIONING_IMPORTANCE = - ConfigDef.Importance.HIGH; - private static final String BIGQUERY_MESSAGE_TIME_PARTITIONING_DOC = - "Whether or not to use the message time when inserting records. 
" - + "Default uses the connector processing time."; - public static final String BIGQUERY_PARTITION_DECORATOR_CONFIG = - "bigQueryPartitionDecorator"; - private static final ConfigDef.Type BIGQUERY_PARTITION_DECORATOR_CONFIG_TYPE = - ConfigDef.Type.BOOLEAN; - //This has been set to true to preserve the existing behavior. However, we can set it to false if field based partitioning is used in BigQuery - public static final Boolean BIGQUERY_PARTITION_DECORATOR_DEFAULT = true; - private static final ConfigDef.Importance BIGQUERY_PARTITION_DECORATOR_IMPORTANCE = - ConfigDef.Importance.HIGH; - private static final String BIGQUERY_PARTITION_DECORATOR_DOC = - "Whether or not to append partition decorator to BigQuery table name when inserting records. " - + "Default is true. Setting this to true appends partition decorator to table name (e.g. table$yyyyMMdd depending on the configuration set for bigQueryPartitionDecorator). " - + "Setting this to false bypasses the logic to append the partition decorator and uses raw table name for inserts."; - - public static final String BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_CONFIG = "timestampPartitionFieldName"; - private static final ConfigDef.Type BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_TYPE = ConfigDef.Type.STRING; - private static final String BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_DEFAULT = null; - private static final ConfigDef.Importance BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_IMPORTANCE = - ConfigDef.Importance.LOW; - private static final String BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_DOC = - "The name of the field in the value that contains the timestamp to partition by in BigQuery" - + " and enable timestamp partitioning for each table. Leave this configuration blank," - + " to enable ingestion time partitioning for each table."; - - public static final String BIGQUERY_CLUSTERING_FIELD_NAMES_CONFIG = "clusteringPartitionFieldNames"; - private static final ConfigDef.Type BIGQUERY_CLUSTERING_FIELD_NAMES_TYPE = ConfigDef.Type.LIST; - private static final List BIGQUERY_CLUSTERING_FIELD_NAMES_DEFAULT = null; - private static final ConfigDef.Importance BIGQUERY_CLUSTERING_FIELD_NAMES_IMPORTANCE = - ConfigDef.Importance.LOW; - private static final String BIGQUERY_CLUSTERING_FIELD_NAMES_DOC = - "List of fields on which data should be clustered by in BigQuery, separated by commas"; - - static { - config = BigQuerySinkConfig.getConfig() - .define( - THREAD_POOL_SIZE_CONFIG, - THREAD_POOL_SIZE_TYPE, - THREAD_POOL_SIZE_DEFAULT, - THREAD_POOL_SIZE_VALIDATOR, - THREAD_POOL_SIZE_IMPORTANCE, - THREAD_POOL_SIZE_DOC - ).define( - QUEUE_SIZE_CONFIG, - QUEUE_SIZE_TYPE, - QUEUE_SIZE_DEFAULT, - QUEUE_SIZE_VALIDATOR, - QUEUE_SIZE_IMPORTANCE, - QUEUE_SIZE_DOC - ).define( - BIGQUERY_RETRY_CONFIG, - BIGQUERY_RETRY_TYPE, - BIGQUERY_RETRY_DEFAULT, - BIGQUERY_RETRY_VALIDATOR, - BIGQUERY_RETRY_IMPORTANCE, - BIGQUERY_RETRY_DOC - ).define( - BIGQUERY_RETRY_WAIT_CONFIG, - BIGQUERY_RETRY_WAIT_CONFIG_TYPE, - BIGQUERY_RETRY_WAIT_DEFAULT, - BIGQUERY_RETRY_WAIT_VALIDATOR, - BIGQUERY_RETRY_WAIT_IMPORTANCE, - BIGQUERY_RETRY_WAIT_DOC - ).define( - BIGQUERY_MESSAGE_TIME_PARTITIONING_CONFIG, - BIGQUERY_MESSAGE_TIME_PARTITIONING_CONFIG_TYPE, - BIGQUERY_MESSAGE_TIME_PARTITIONING_DEFAULT, - BIGQUERY_MESSAGE_TIME_PARTITIONING_IMPORTANCE, - BIGQUERY_MESSAGE_TIME_PARTITIONING_DOC - ).define( - BIGQUERY_PARTITION_DECORATOR_CONFIG, - BIGQUERY_PARTITION_DECORATOR_CONFIG_TYPE, - BIGQUERY_PARTITION_DECORATOR_DEFAULT, - BIGQUERY_PARTITION_DECORATOR_IMPORTANCE, - BIGQUERY_PARTITION_DECORATOR_DOC - 
).define( - BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_CONFIG, - BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_TYPE, - BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_DEFAULT, - BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_IMPORTANCE, - BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_DOC - ).define( - BIGQUERY_CLUSTERING_FIELD_NAMES_CONFIG, - BIGQUERY_CLUSTERING_FIELD_NAMES_TYPE, - BIGQUERY_CLUSTERING_FIELD_NAMES_DEFAULT, - BIGQUERY_CLUSTERING_FIELD_NAMES_IMPORTANCE, - BIGQUERY_CLUSTERING_FIELD_NAMES_DOC - ); - } + public static final String GCS_BQ_TASK_CONFIG = "GCSBQTask"; + private static final ConfigDef.Type GCS_BQ_TASK_TYPE = ConfigDef.Type.BOOLEAN; + private static final boolean GCS_BQ_TASK_DEFAULT = false; + private static final ConfigDef.Importance GCS_BQ_TASK_IMPORTANCE = ConfigDef.Importance.LOW; - private void checkSchemaUpdates() { - Class schemaRetriever = getClass(BigQuerySinkConfig.SCHEMA_RETRIEVER_CONFIG); - - boolean allowNewBigQueryFields = getBoolean(BigQuerySinkConfig.ALLOW_NEW_BIGQUERY_FIELDS_CONFIG); - boolean allowRequiredFieldRelaxation = getBoolean(BigQuerySinkConfig.ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_CONFIG); - if ((allowNewBigQueryFields || allowRequiredFieldRelaxation) && schemaRetriever == null) { - throw new ConfigException( - "Cannot perform schema updates without a schema retriever" - ); - } - - if (schemaRetriever == null) { - logger.warn( - "No schema retriever class provided; auto schema updates are impossible" - ); - } - } - - /** - * Returns the field name to use for timestamp partitioning. - * @return String that represents the field name. - */ - public Optional getTimestampPartitionFieldName() { - return Optional.ofNullable(getString(BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_CONFIG)); - } - - /** - * Returns the field names to use for clustering. - * @return List of Strings that represent the field names. - */ - public Optional> getClusteringPartitionFieldName() { - return Optional.ofNullable(getList(BIGQUERY_CLUSTERING_FIELD_NAMES_CONFIG)); - } - - /** - * Check the validity of table partitioning configs. - */ - private void checkPartitionConfigs() { - if (getTimestampPartitionFieldName().isPresent() && getBoolean(BIGQUERY_PARTITION_DECORATOR_CONFIG)) { - throw new ConfigException( - "Only one partitioning configuration mode may be specified for the connector. " - + "Use either bigQueryPartitionDecorator OR timestampPartitionFieldName." - ); - } - } + public static final String TASK_ID_CONFIG = "taskId"; + private static final ConfigDef.Type TASK_ID_TYPE = ConfigDef.Type.INT; + public static final ConfigDef.Importance TASK_ID_IMPORTANCE = ConfigDef.Importance.LOW; /** - * Check the validity of table clustering configs. + * Return a ConfigDef object used to define this config's fields. + * + * @return A ConfigDef object used to define this config's fields. */ - private void checkClusteringConfigs() { - if (getClusteringPartitionFieldName().isPresent()) { - if (!getTimestampPartitionFieldName().isPresent() && !getBoolean(BIGQUERY_PARTITION_DECORATOR_CONFIG)) { - throw new ConfigException( - "Clustering field name may be specified only on a partitioned table." - ); - } - if (getClusteringPartitionFieldName().get().size() > 4) { - throw new ConfigException( - "You can only specify up to four clustering field names." 
+ public static ConfigDef config() { + return BigQuerySinkConfig.getConfig() + .defineInternal( + GCS_BQ_TASK_CONFIG, + GCS_BQ_TASK_TYPE, + GCS_BQ_TASK_DEFAULT, + GCS_BQ_TASK_IMPORTANCE + ).defineInternal( + TASK_ID_CONFIG, + TASK_ID_TYPE, + ConfigDef.NO_DEFAULT_VALUE, + TASK_ID_IMPORTANCE ); - } - } - } - - public static ConfigDef getConfig() { - return config; } /** * @param properties A Map detailing configuration properties and their respective values. */ public BigQuerySinkTaskConfig(Map properties) { - super(config, properties); - checkSchemaUpdates(); - checkPartitionConfigs(); - checkClusteringConfigs(); + super(config(), properties); } } diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/CredentialsValidator.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/CredentialsValidator.java new file mode 100644 index 000000000..0aa8d6d89 --- /dev/null +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/CredentialsValidator.java @@ -0,0 +1,126 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package com.wepay.kafka.connect.bigquery.config; + +import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.storage.Storage; +import com.wepay.kafka.connect.bigquery.GcpClientBuilder; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Optional; + +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.ENABLE_BATCH_CONFIG; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.GCS_BUCKET_NAME_CONFIG; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.KEYFILE_CONFIG; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.KEY_SOURCE_CONFIG; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.PROJECT_CONFIG; +import static com.wepay.kafka.connect.bigquery.GcpClientBuilder.KeySource; + +public abstract class CredentialsValidator> extends MultiPropertyValidator { + + public CredentialsValidator() { + super(KEYFILE_CONFIG); + } + + private static final Collection DEPENDENTS = Collections.unmodifiableCollection(Arrays.asList( + PROJECT_CONFIG, KEY_SOURCE_CONFIG + )); + + @Override + protected Collection dependents() { + return DEPENDENTS; + } + + @Override + protected Optional doValidate(BigQuerySinkConfig config) { + String keyFile = config.getKey(); + KeySource keySource = config.getKeySource(); + + if (keySource == KeySource.APPLICATION_DEFAULT && keyFile != null && !keyFile.isEmpty()) { + String errorMessage = KEYFILE_CONFIG + " should not be provided if " + KEY_SOURCE_CONFIG + + " is " + KeySource.APPLICATION_DEFAULT; + return Optional.of(errorMessage); + } + + if ((keyFile == null || keyFile.isEmpty()) && config.getKeySource() != 
GcpClientBuilder.KeySource.APPLICATION_DEFAULT) { + // No credentials to validate + return Optional.empty(); + } + + try { + clientBuilder() + .withConfig(config) + .build(); + return Optional.empty(); + } catch (RuntimeException e) { + String errorMessage = "An unexpected error occurred while validating credentials for " + gcpService(); + if (e.getMessage() != null) { + errorMessage += ": " + e.getMessage(); + } + return Optional.of(errorMessage); + } + } + + protected abstract String gcpService(); + protected abstract ClientBuilder clientBuilder(); + + public static class BigQueryCredentialsValidator extends CredentialsValidator> { + @Override + public String gcpService() { + return "BigQuery"; + } + + @Override + protected GcpClientBuilder clientBuilder() { + return new GcpClientBuilder.BigQueryBuilder(); + } + } + + public static class GcsCredentialsValidator extends CredentialsValidator> { + + private static final Collection DEPENDENTS; + + static { + List dependents = new ArrayList<>(CredentialsValidator.DEPENDENTS); + dependents.add(ENABLE_BATCH_CONFIG); + dependents.add(GCS_BUCKET_NAME_CONFIG); + DEPENDENTS = Collections.unmodifiableCollection(dependents); + } + + @Override + public Collection dependents() { + return DEPENDENTS; + } + + @Override + public String gcpService() { + return "GCS"; + } + + @Override + protected GcpClientBuilder clientBuilder() { + return new GcpClientBuilder.GcsBuilder(); + } + } +} diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/GcsBucketValidator.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/GcsBucketValidator.java new file mode 100644 index 000000000..59d2dbd44 --- /dev/null +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/GcsBucketValidator.java @@ -0,0 +1,97 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
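The credentials check above simply attempts to construct a client from the connector config and treats any failure as a configuration error. The same builder calls that appear in this class can be reused directly; a sketch:

```java
import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.storage.Storage;

import com.wepay.kafka.connect.bigquery.GcpClientBuilder;
import com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig;

public class ClientBuilderSketch {
  // If either call throws, the keyfile/keySource pair in the config is not usable,
  // which is exactly the signal the credentials validators report back.
  static BigQuery bigQuery(BigQuerySinkConfig config) {
    return new GcpClientBuilder.BigQueryBuilder().withConfig(config).build();
  }

  static Storage gcs(BigQuerySinkConfig config) {
    return new GcpClientBuilder.GcsBuilder().withConfig(config).build();
  }
}
```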
+ */ + +package com.wepay.kafka.connect.bigquery.config; + +import com.google.cloud.storage.Bucket; +import com.google.cloud.storage.Storage; +import com.google.common.annotations.VisibleForTesting; +import com.wepay.kafka.connect.bigquery.GcpClientBuilder; + +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Optional; + +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.AUTO_CREATE_BUCKET_CONFIG; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.ENABLE_BATCH_CONFIG; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.GCS_BUCKET_NAME_CONFIG; + +public class GcsBucketValidator extends MultiPropertyValidator { + + public GcsBucketValidator() { + super(GCS_BUCKET_NAME_CONFIG); + } + + private static final Collection DEPENDENTS = Collections.unmodifiableCollection(Arrays.asList( + ENABLE_BATCH_CONFIG, AUTO_CREATE_BUCKET_CONFIG + )); + + @Override + protected Collection dependents() { + return DEPENDENTS; + } + + @Override + protected Optional doValidate(BigQuerySinkConfig config) { + Storage gcs; + try { + gcs = new GcpClientBuilder.GcsBuilder() + .withConfig(config) + .build(); + } catch (RuntimeException e) { + return Optional.of(String.format( + "Failed to construct GCS client%s", + e.getMessage() != null ? ": " + e.getMessage() : "" + )); + } + return doValidate(gcs, config); + } + + @VisibleForTesting + Optional doValidate(Storage gcs, BigQuerySinkConfig config) { + List batchLoadedTopics = config.getList(ENABLE_BATCH_CONFIG); + if (batchLoadedTopics == null || batchLoadedTopics.isEmpty()) { + // Batch loading is disabled; no need to validate the GCS bucket + return Optional.empty(); + } + + String bucketName = config.getString(GCS_BUCKET_NAME_CONFIG); + if (bucketName == null || bucketName.trim().isEmpty()) { + return Optional.of("When GCS batch loading is enabled, a bucket must be provided"); + } + + if (config.getBoolean(AUTO_CREATE_BUCKET_CONFIG)) { + return Optional.empty(); + } + + Bucket bucket = gcs.get(bucketName); + if (bucket == null) { + return Optional.of(String.format( + "Automatic bucket creation is disabled but the GCS bucket %s does not exist. " + + "Please either manually create this table before restarting the connector or enable automatic bucket creation " + + "by the connector", + bucketName + )); + } + + return Optional.empty(); + } +} diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/MultiPropertyValidator.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/MultiPropertyValidator.java new file mode 100644 index 000000000..95b9c2da6 --- /dev/null +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/MultiPropertyValidator.java @@ -0,0 +1,70 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
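The bucket check above only fires when batch loading is enabled, and existence of the bucket is not required when the connector may create it itself. A sketch of the properties involved, assuming the batch-loading, bucket-name, and auto-create-bucket constants imported above are publicly accessible; the topic and bucket names are placeholders.

```java
import java.util.HashMap;
import java.util.Map;

import com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig;

public class GcsBatchLoadingPropsSketch {
  public static void main(String[] args) {
    Map<String, String> props = new HashMap<>();
    // Batch loading is enabled per topic; once any topic is listed, a bucket is required.
    props.put(BigQuerySinkConfig.ENABLE_BATCH_CONFIG, "page-views,clicks");
    props.put(BigQuerySinkConfig.GCS_BUCKET_NAME_CONFIG, "kcbq-batch-staging");
    // With auto-creation enabled, the validator does not require the bucket to exist yet.
    props.put(BigQuerySinkConfig.AUTO_CREATE_BUCKET_CONFIG, "true");
    props.forEach((k, v) -> System.out.println(k + "=" + v));
  }
}
```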
+ */ + +package com.wepay.kafka.connect.bigquery.config; + +import org.apache.kafka.common.config.ConfigValue; + +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; + +public abstract class MultiPropertyValidator { + + private final String propertyName; + + protected MultiPropertyValidator(String propertyName) { + this.propertyName = propertyName; + } + + public String propertyName() { + return propertyName; + } + + public Optional validate(ConfigValue value, Config config, Map valuesByName) { + // Only perform follow-up validation if the property doesn't already have an error associated with it + if (!value.errorMessages().isEmpty()) { + return Optional.empty(); + } + + boolean dependentsAreValid = dependents().stream() + .map(valuesByName::get) + .filter(Objects::nonNull) + .map(ConfigValue::errorMessages) + .allMatch(List::isEmpty); + // Also ensure that all of the other properties that the validation for this one depends on don't already have errors + if (!dependentsAreValid) { + return Optional.empty(); + } + + try { + return doValidate(config); + } catch (RuntimeException e) { + return Optional.of( + "An unexpected error occurred during validation" + + (e.getMessage() != null ? ": " + e.getMessage() : "") + ); + } + } + + protected abstract Collection dependents(); + protected abstract Optional doValidate(Config config); +} diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/PartitioningModeValidator.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/PartitioningModeValidator.java new file mode 100644 index 000000000..65389e5fd --- /dev/null +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/PartitioningModeValidator.java @@ -0,0 +1,60 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package com.wepay.kafka.connect.bigquery.config; + +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Optional; + +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_CONFIG; + +public class PartitioningModeValidator extends MultiPropertyValidator { + public PartitioningModeValidator() { + super(BIGQUERY_PARTITION_DECORATOR_CONFIG); + } + + private static final Collection DEPENDENTS = Collections.unmodifiableCollection(Arrays.asList( + BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_CONFIG + )); + + @Override + protected Collection dependents() { + return DEPENDENTS; + } + + @Override + protected Optional doValidate(BigQuerySinkConfig config) { + if (!config.getBoolean(BIGQUERY_PARTITION_DECORATOR_CONFIG)) { + return Optional.empty(); + } + + if (config.getTimestampPartitionFieldName().isPresent()) { + return Optional.of(String.format("Only one partitioning mode may be specified for the connector. " + + "Use either %s OR %s.", + BIGQUERY_PARTITION_DECORATOR_CONFIG, + BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_CONFIG + )); + } else { + return Optional.empty(); + } + } +} diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/PartitioningTypeValidator.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/PartitioningTypeValidator.java new file mode 100644 index 000000000..0fea75990 --- /dev/null +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/PartitioningTypeValidator.java @@ -0,0 +1,65 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package com.wepay.kafka.connect.bigquery.config; + +import com.google.cloud.bigquery.TimePartitioning; + +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Optional; + +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.TABLE_CREATE_CONFIG; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.TIME_PARTITIONING_TYPE_CONFIG; + +public class PartitioningTypeValidator extends MultiPropertyValidator { + public PartitioningTypeValidator() { + super(TIME_PARTITIONING_TYPE_CONFIG); + } + + private static final Collection DEPENDENTS = Collections.unmodifiableCollection(Arrays.asList( + BIGQUERY_PARTITION_DECORATOR_CONFIG, TABLE_CREATE_CONFIG + )); + + @Override + protected Collection dependents() { + return DEPENDENTS; + } + + @Override + protected Optional doValidate(BigQuerySinkConfig config) { + if (!config.getBoolean(BIGQUERY_PARTITION_DECORATOR_CONFIG) || !config.getBoolean(TABLE_CREATE_CONFIG)) { + return Optional.empty(); + } + + Optional timePartitioningType = config.getTimePartitioningType(); + + if (!Optional.of(TimePartitioning.Type.DAY).equals(timePartitioningType)) { + return Optional.of( + "Tables must be partitioned by DAY when using partition decorator syntax. " + + "Either configure the connector with the DAY time partitioning type, " + + "disable automatic table creation, or disable partition decorator syntax." + ); + } + + return Optional.empty(); + } +} diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/UpsertDeleteValidator.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/UpsertDeleteValidator.java new file mode 100644 index 000000000..a20178258 --- /dev/null +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/UpsertDeleteValidator.java @@ -0,0 +1,102 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
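Taken together, the two partitioning validators mean that partition decorator syntax only works with DAY partitioning (when tables are auto-created) and cannot be combined with field-based timestamp partitioning. A sketch of a config that opts out of decorator syntax in order to use hourly, field-based partitioning; the field name is a placeholder.

```java
import java.util.HashMap;
import java.util.Map;

import com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig;

public class FieldPartitioningPropsSketch {
  public static void main(String[] args) {
    Map<String, String> props = new HashMap<>();
    // Decorator syntax is incompatible with both a partition field and non-DAY partitioning.
    props.put(BigQuerySinkConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG, "false");
    props.put(BigQuerySinkConfig.BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_CONFIG, "event_time");
    props.put(BigQuerySinkConfig.TIME_PARTITIONING_TYPE_CONFIG, "HOUR");
    props.forEach((k, v) -> System.out.println(k + "=" + v));
  }
}
```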
+ */ + +package com.wepay.kafka.connect.bigquery.config; + +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Optional; + +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.DELETE_ENABLED_CONFIG; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.KAFKA_KEY_FIELD_NAME_CONFIG; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.MERGE_INTERVAL_MS_CONFIG; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.MERGE_RECORDS_THRESHOLD_CONFIG; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.UPSERT_ENABLED_CONFIG; + +public abstract class UpsertDeleteValidator extends MultiPropertyValidator { + private UpsertDeleteValidator(String propertyName) { + super(propertyName); + } + + private static final Collection DEPENDENTS = Collections.unmodifiableCollection(Arrays.asList( + MERGE_INTERVAL_MS_CONFIG, MERGE_RECORDS_THRESHOLD_CONFIG, KAFKA_KEY_FIELD_NAME_CONFIG + )); + + @Override + protected Collection dependents() { + return DEPENDENTS; + } + + @Override + protected Optional doValidate(BigQuerySinkConfig config) { + if (!modeEnabled(config)) { + return Optional.empty(); + } + + long mergeInterval = config.getLong(MERGE_INTERVAL_MS_CONFIG); + long mergeRecordsThreshold = config.getLong(MERGE_RECORDS_THRESHOLD_CONFIG); + + if (mergeInterval == -1 && mergeRecordsThreshold == -1) { + return Optional.of(String.format( + "%s and %s cannot both be -1", + MERGE_INTERVAL_MS_CONFIG, + MERGE_RECORDS_THRESHOLD_CONFIG + )); + } + + if (!config.getKafkaKeyFieldName().isPresent()) { + return Optional.of(String.format( + "%s must be specified when %s is set to true", + KAFKA_KEY_FIELD_NAME_CONFIG, + propertyName() + )); + } + + return Optional.empty(); + } + + /** + * @param config the user-provided configuration + * @return whether the write mode for the validator (i.e., either upsert or delete) is enabled + */ + protected abstract boolean modeEnabled(BigQuerySinkConfig config); + + public static class UpsertValidator extends UpsertDeleteValidator { + public UpsertValidator() { + super(UPSERT_ENABLED_CONFIG); + } + + @Override + protected boolean modeEnabled(BigQuerySinkConfig config) { + return config.getBoolean(UPSERT_ENABLED_CONFIG); + } + } + + public static class DeleteValidator extends UpsertDeleteValidator { + public DeleteValidator() { + super(DELETE_ENABLED_CONFIG); + } + + @Override + protected boolean modeEnabled(BigQuerySinkConfig config) { + return config.getBoolean(DELETE_ENABLED_CONFIG); + } + } +} diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/BigQueryRecordConverter.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/BigQueryRecordConverter.java index 835935bd8..05725dd1c 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/BigQueryRecordConverter.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/BigQueryRecordConverter.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.convert; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. 
*/ +package com.wepay.kafka.connect.bigquery.convert; import com.google.cloud.bigquery.InsertAllRequest.RowToInsert; import com.wepay.kafka.connect.bigquery.api.KafkaSchemaRecordType; @@ -47,12 +48,12 @@ */ public class BigQueryRecordConverter implements RecordConverter> { - private static final Set BASIC_TYPES = new HashSet( + private static final Set> BASIC_TYPES = new HashSet<>( Arrays.asList( Boolean.class, Character.class, Byte.class, Short.class, Integer.class, Long.class, Float.class, Double.class, String.class) ); - private boolean shouldConvertSpecialDouble; + private final boolean shouldConvertSpecialDouble; static { // force registration @@ -72,6 +73,7 @@ public BigQueryRecordConverter(boolean shouldConvertDoubleSpecial) { * @param recordType The type of the record to convert, either value or key. * @return The result BigQuery row content. */ + @SuppressWarnings("unchecked") public Map convertRecord(SinkRecord record, KafkaSchemaRecordType recordType) { Schema kafkaConnectSchema = recordType == KafkaSchemaRecordType.KEY ? record.keySchema() : record.valueSchema(); Object kafkaConnectStruct = recordType == KafkaSchemaRecordType.KEY ? record.key() : record.value(); @@ -89,6 +91,7 @@ public Map convertRecord(SinkRecord record, KafkaSchemaRecordTyp return convertStruct(kafkaConnectStruct, kafkaConnectSchema); } + @SuppressWarnings("unchecked") private Object convertSchemalessRecord(Object value) { if (value == null) { return null; @@ -103,10 +106,9 @@ private Object convertSchemalessRecord(Object value) { return convertBytes(value); } if (value instanceof List) { - return - ((List) value).stream().map( - v -> convertSchemalessRecord(v) - ).collect(Collectors.toList()); + return ((List) value).stream() + .map(this::convertSchemalessRecord) + .collect(Collectors.toList()); } if (value instanceof Map) { return @@ -128,7 +130,6 @@ private Object convertSchemalessRecord(Object value) { " found in schemaless record data. 
Can't convert record to bigQuery format"); } - @SuppressWarnings("unchecked") private Object convertObject(Object kafkaConnectObject, Schema kafkaConnectSchema) { if (kafkaConnectObject == null) { if (kafkaConnectSchema.isOptional()) { @@ -152,22 +153,16 @@ private Object convertObject(Object kafkaConnectObject, Schema kafkaConnectSchem return convertStruct(kafkaConnectObject, kafkaConnectSchema); case BYTES: return convertBytes(kafkaConnectObject); - case BOOLEAN: - return (Boolean) kafkaConnectObject; - case FLOAT32: - return (Float) kafkaConnectObject; case FLOAT64: return convertDouble((Double)kafkaConnectObject); + case BOOLEAN: + case FLOAT32: case INT8: - return (Byte) kafkaConnectObject; case INT16: - return (Short) kafkaConnectObject; case INT32: - return (Integer) kafkaConnectObject; case INT64: - return (Long) kafkaConnectObject; case STRING: - return (String) kafkaConnectObject; + return kafkaConnectObject; default: throw new ConversionConnectException("Unrecognized schema type: " + kafkaConnectSchemaType); } @@ -214,7 +209,7 @@ private List> convertMap(Object kafkaConnectObject, Schema kafkaConnectValueSchema = kafkaConnectSchema.valueSchema(); List> bigQueryEntryList = new ArrayList<>(); Map kafkaConnectMap = (Map) kafkaConnectObject; - for (Map.Entry kafkaConnectMapEntry : kafkaConnectMap.entrySet()) { + for (Map.Entry kafkaConnectMapEntry : kafkaConnectMap.entrySet()) { Map bigQueryEntry = new HashMap<>(); Object bigQueryKey = convertObject( kafkaConnectMapEntry.getKey(), diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/BigQuerySchemaConverter.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/BigQuerySchemaConverter.java index 175d6f884..731670044 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/BigQuerySchemaConverter.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/BigQuerySchemaConverter.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.convert; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. 
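For orientation, a rough, self-contained sketch of how the converter changed above is typically driven; the topic name, schema, and record contents are made up for illustration:

```java
import com.wepay.kafka.connect.bigquery.api.KafkaSchemaRecordType;
import com.wepay.kafka.connect.bigquery.convert.BigQueryRecordConverter;
import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.data.SchemaBuilder;
import org.apache.kafka.connect.data.Struct;
import org.apache.kafka.connect.sink.SinkRecord;

import java.util.Map;

public class RecordConverterSketch {
  public static void main(String[] args) {
    Schema valueSchema = SchemaBuilder.struct()
        .field("id", Schema.INT64_SCHEMA)
        .field("name", Schema.STRING_SCHEMA)
        .build();
    Struct value = new Struct(valueSchema).put("id", 42L).put("name", "alice");

    // (topic, partition, keySchema, key, valueSchema, value, offset)
    SinkRecord record = new SinkRecord("pageviews", 0, null, null, valueSchema, value, 0L);

    // The boolean flag governs special handling of NaN/Infinity doubles; false keeps it off here.
    BigQueryRecordConverter converter = new BigQueryRecordConverter(false);
    Map<String, Object> row = converter.convertRecord(record, KafkaSchemaRecordType.VALUE);
    System.out.println(row); // a map with entries id=42 and name=alice
  }
}
```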
*/ +package com.wepay.kafka.connect.bigquery.convert; import com.google.cloud.bigquery.FieldList; import com.google.cloud.bigquery.LegacySQLTypeName; @@ -27,8 +28,10 @@ import com.wepay.kafka.connect.bigquery.convert.logicaltype.LogicalTypeConverter; import com.wepay.kafka.connect.bigquery.exception.ConversionConnectException; +import com.wepay.kafka.connect.bigquery.utils.FieldNameSanitizer; import org.apache.kafka.connect.data.Schema; +import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -81,9 +84,16 @@ public class BigQuerySchemaConverter implements SchemaConverter()); + List fields = kafkaConnectSchema.fields().stream() .flatMap(kafkaConnectField -> convertField(kafkaConnectField.schema(), kafkaConnectField.name()) @@ -114,10 +127,43 @@ public com.google.cloud.bigquery.Schema convertSchema(Schema kafkaConnectSchema) return com.google.cloud.bigquery.Schema.of(fields); } + private void throwOnCycle(Schema kafkaConnectSchema, List seenSoFar) { + if (PRIMITIVE_TYPE_MAP.containsKey(kafkaConnectSchema.type())) { + return; + } + + if (seenSoFar.contains(kafkaConnectSchema)) { + throw new ConversionConnectException("Kafka Connect schema contains cycle"); + } + + seenSoFar.add(kafkaConnectSchema); + switch(kafkaConnectSchema.type()) { + case ARRAY: + throwOnCycle(kafkaConnectSchema.valueSchema(), seenSoFar); + break; + case MAP: + throwOnCycle(kafkaConnectSchema.keySchema(), seenSoFar); + throwOnCycle(kafkaConnectSchema.valueSchema(), seenSoFar); + break; + case STRUCT: + kafkaConnectSchema.fields().forEach(f -> throwOnCycle(f.schema(), seenSoFar)); + break; + default: + throw new ConversionConnectException( + "Unrecognized schema type: " + kafkaConnectSchema.type() + ); + } + seenSoFar.remove(seenSoFar.size() - 1); + } + private Optional convertField(Schema kafkaConnectSchema, String fieldName) { Optional result; Schema.Type kafkaConnectSchemaType = kafkaConnectSchema.type(); + if (sanitizeFieldNames) { + fieldName = FieldNameSanitizer.sanitizeName(fieldName); + } + if (LogicalConverterRegistry.isRegisteredLogicalType(kafkaConnectSchema.name())) { result = Optional.of(convertLogical(kafkaConnectSchema, fieldName)); } else if (PRIMITIVE_TYPE_MAP.containsKey(kafkaConnectSchemaType)) { diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/KafkaDataBuilder.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/KafkaDataBuilder.java index 2de1888e0..e3dba69df 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/KafkaDataBuilder.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/KafkaDataBuilder.java @@ -1,3 +1,22 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
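The `throwOnCycle` check added above does a depth-first walk while tracking the current path and backtracking afterwards; a minimal, generic sketch of that idea using a hypothetical `Node` type (not Connect schemas):

```java
import java.util.ArrayList;
import java.util.List;

public class CycleCheckSketch {
  // Hypothetical recursive structure standing in for a Kafka Connect schema.
  static class Node {
    final String name;
    final List<Node> children = new ArrayList<>();
    Node(String name) { this.name = name; }
  }

  static void throwOnCycle(Node node, List<Node> seenSoFar) {
    if (seenSoFar.contains(node)) {
      throw new IllegalStateException("Cycle detected at node " + node.name);
    }
    seenSoFar.add(node);
    node.children.forEach(child -> throwOnCycle(child, seenSoFar));
    // Backtrack so siblings that legitimately reuse a node are not flagged.
    seenSoFar.remove(seenSoFar.size() - 1);
  }

  public static void main(String[] args) {
    Node parent = new Node("parent");
    Node child = new Node("child");
    parent.children.add(child);
    child.children.add(parent); // introduces a cycle
    try {
      throwOnCycle(parent, new ArrayList<>());
    } catch (IllegalStateException e) {
      System.out.println(e.getMessage()); // Cycle detected at node parent
    }
  }
}
```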
+ */ + package com.wepay.kafka.connect.bigquery.convert; diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/RecordConverter.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/RecordConverter.java index 0c41b21c6..9a9a38efe 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/RecordConverter.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/RecordConverter.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.convert; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.convert; import com.wepay.kafka.connect.bigquery.api.KafkaSchemaRecordType; import org.apache.kafka.connect.sink.SinkRecord; diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/SchemaConverter.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/SchemaConverter.java index 985c736f8..8ca2e68b7 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/SchemaConverter.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/SchemaConverter.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.convert; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.convert; import org.apache.kafka.connect.data.Schema; diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/DebeziumLogicalConverters.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/DebeziumLogicalConverters.java index ab86b9ea5..19939bf36 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/DebeziumLogicalConverters.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/DebeziumLogicalConverters.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.convert.logicaltype; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. 
*/ +package com.wepay.kafka.connect.bigquery.convert.logicaltype; import com.google.cloud.bigquery.LegacySQLTypeName; @@ -46,6 +47,7 @@ public class DebeziumLogicalConverters { LogicalConverterRegistry.register(MicroTimestamp.SCHEMA_NAME, new MicroTimestampConverter()); LogicalConverterRegistry.register(Time.SCHEMA_NAME, new TimeConverter()); LogicalConverterRegistry.register(ZonedTimestamp.SCHEMA_NAME, new ZonedTimestampConverter()); + LogicalConverterRegistry.register(Timestamp.SCHEMA_NAME, new TimestampConverter()); } private static final int MICROS_IN_SEC = 1000000; @@ -150,7 +152,7 @@ public TimeConverter() { @Override public String convert(Object kafkaConnectObject) { - java.util.Date date = new java.util.Date((Long) kafkaConnectObject); + java.util.Date date = new java.util.Date((Integer) kafkaConnectObject); return getBQTimeFormat().format(date); } } diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/KafkaLogicalConverters.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/KafkaLogicalConverters.java index 9a0176b4d..6d3685ac5 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/KafkaLogicalConverters.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/KafkaLogicalConverters.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.convert.logicaltype; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,12 +17,14 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.convert.logicaltype; import com.google.cloud.bigquery.LegacySQLTypeName; import org.apache.kafka.connect.data.Date; import org.apache.kafka.connect.data.Decimal; import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.Time; import org.apache.kafka.connect.data.Timestamp; import java.math.BigDecimal; @@ -36,6 +38,7 @@ public class KafkaLogicalConverters { LogicalConverterRegistry.register(Date.LOGICAL_NAME, new DateConverter()); LogicalConverterRegistry.register(Decimal.LOGICAL_NAME, new DecimalConverter()); LogicalConverterRegistry.register(Timestamp.LOGICAL_NAME, new TimestampConverter()); + LogicalConverterRegistry.register(Time.LOGICAL_NAME, new TimeConverter()); } /** @@ -95,4 +98,24 @@ public String convert(Object kafkaConnectObject) { return getBqTimestampFormat().format((java.util.Date) kafkaConnectObject); } } + + + /** + * Class for converting Kafka time logical types to BigQuery time types. + */ + public static class TimeConverter extends LogicalTypeConverter { + /** + * Create a new TimestampConverter. 
+ */ + public TimeConverter() { + super(Time.LOGICAL_NAME, + Schema.Type.INT32, + LegacySQLTypeName.TIME); + } + + @Override + public String convert(Object kafkaConnectObject) { + return getBqTimeFormat().format((java.util.Date) kafkaConnectObject); + } + } } diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/LogicalConverterRegistry.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/LogicalConverterRegistry.java index 36757de47..b21bcf613 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/LogicalConverterRegistry.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/LogicalConverterRegistry.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.convert.logicaltype; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.convert.logicaltype; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/LogicalTypeConverter.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/LogicalTypeConverter.java index 39e22167b..9adaa330b 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/LogicalTypeConverter.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/LogicalTypeConverter.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.convert.logicaltype; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.convert.logicaltype; import com.google.cloud.bigquery.LegacySQLTypeName; @@ -87,10 +88,10 @@ protected static SimpleDateFormat getBqTimestampFormat() { return bqTimestampFormat; } - protected static SimpleDateFormat getBQDatetimeFormat() { - SimpleDateFormat bqDateTimeFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS"); - bqDateTimeFormat.setTimeZone(utcTimeZone); - return bqDateTimeFormat; + protected SimpleDateFormat getBqTimeFormat() { + SimpleDateFormat bqTimestampFormat = new SimpleDateFormat("HH:mm:ss.SSS"); + bqTimestampFormat.setTimeZone(utcTimeZone); + return bqTimestampFormat; } protected static SimpleDateFormat getBQDateFormat() { diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/exception/BigQueryConnectException.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/exception/BigQueryConnectException.java index 40fefd7da..38561917f 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/exception/BigQueryConnectException.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/exception/BigQueryConnectException.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.exception; - /* - * Copyright 2016 WePay, Inc. 
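The new Kafka `Time` converter above relies on an `HH:mm:ss.SSS` UTC formatter applied to the `java.util.Date` that Connect uses to carry milliseconds past midnight; a tiny standalone illustration of that formatting:

```java
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.TimeZone;

public class TimeFormatSketch {
  public static void main(String[] args) {
    // 10:15:30.500 past midnight, expressed in milliseconds as Connect's Time logical type does.
    long millisPastMidnight = (10 * 3600 + 15 * 60 + 30) * 1000L + 500;

    SimpleDateFormat bqTimeFormat = new SimpleDateFormat("HH:mm:ss.SSS");
    bqTimeFormat.setTimeZone(TimeZone.getTimeZone("UTC"));

    System.out.println(bqTimeFormat.format(new Date(millisPastMidnight))); // 10:15:30.500
  }
}
```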
+ * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.exception; import com.google.cloud.bigquery.BigQueryError; @@ -42,18 +43,19 @@ public BigQueryConnectException(Throwable thr) { super(thr); } - public BigQueryConnectException(Map> errors) { - super(formatInsertAllErrors(errors)); + public BigQueryConnectException(String tableInfo, Map> errors) { + super(formatInsertAllErrors(tableInfo, errors)); } - private static String formatInsertAllErrors(Map> errorsMap) { + private static String formatInsertAllErrors(String tableInfo, Map> errorsMap) { StringBuilder messageBuilder = new StringBuilder(); - messageBuilder.append("table insertion failed for the following rows:"); + messageBuilder.append(String.format("table: %s insertion failed for the following rows:", tableInfo)); for (Map.Entry> errorsEntry : errorsMap.entrySet()) { for (BigQueryError error : errorsEntry.getValue()) { messageBuilder.append(String.format( - "%n\t[row index %d]: %s: %s", + "%n\t[row index %d] (location %s, reason: %s): %s", errorsEntry.getKey(), + error.getLocation(), error.getReason(), error.getMessage() )); @@ -61,4 +63,10 @@ private static String formatInsertAllErrors(Map> error } return messageBuilder.toString(); } + + @Override + public String toString() { + return getCause() != null ? + super.toString() + "\nCaused by: " + getCause().getLocalizedMessage() : super.toString(); + } } diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/exception/ConversionConnectException.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/exception/ConversionConnectException.java index 795ea6749..29e10bd43 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/exception/ConversionConnectException.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/exception/ConversionConnectException.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.exception; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.exception; import org.apache.kafka.connect.errors.ConnectException; diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/exception/SinkConfigConnectException.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/exception/ExpectedInterruptException.java similarity index 59% rename from kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/exception/SinkConfigConnectException.java rename to kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/exception/ExpectedInterruptException.java index 98a11c069..866787e81 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/exception/SinkConfigConnectException.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/exception/ExpectedInterruptException.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.exception; - /* - * Copyright 2016 WePay, Inc. 
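The reworked `formatInsertAllErrors` above now names the target table and includes each error's location and reason; a quick standalone rendering of that message layout, with invented values:

```java
public class InsertErrorMessageSketch {
  public static void main(String[] args) {
    StringBuilder message = new StringBuilder();
    message.append(String.format("table: %s insertion failed for the following rows:", "my_dataset.pageviews"));
    // One failed row, mirroring the per-error format used above.
    message.append(String.format(
        "%n\t[row index %d] (location %s, reason: %s): %s",
        0L, "record_content", "invalid", "Missing required field: name."));
    System.out.println(message);
  }
}
```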
+ * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,23 +17,13 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.exception; import org.apache.kafka.connect.errors.ConnectException; -/** - * Class for exceptions that occur while attempting to process configuration files, including both - * formatting and logical errors. - */ -public class SinkConfigConnectException extends ConnectException { - public SinkConfigConnectException(String msg) { - super(msg); - } - - public SinkConfigConnectException(String msg, Throwable thr) { - super(msg, thr); - } +public class ExpectedInterruptException extends ConnectException { - public SinkConfigConnectException(Throwable thr) { - super(thr); + public ExpectedInterruptException(String message) { + super(message); } } diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/exception/GCSConnectException.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/exception/GCSConnectException.java index 1ed2ae885..c676b50e5 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/exception/GCSConnectException.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/exception/GCSConnectException.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.exception; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.exception; import org.apache.kafka.connect.errors.ConnectException; diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/retrieve/IdentitySchemaRetriever.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/retrieve/IdentitySchemaRetriever.java index ccf36be70..8b1560efb 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/retrieve/IdentitySchemaRetriever.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/retrieve/IdentitySchemaRetriever.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.retrieve; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ * under the License. 
*/ +package com.wepay.kafka.connect.bigquery.retrieve; + import com.wepay.kafka.connect.bigquery.api.SchemaRetriever; import org.apache.kafka.connect.data.Schema; import org.apache.kafka.connect.sink.SinkRecord; diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/utils/FieldNameSanitizer.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/utils/FieldNameSanitizer.java index 09aeb70c2..c999b08ab 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/utils/FieldNameSanitizer.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/utils/FieldNameSanitizer.java @@ -1,7 +1,26 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package com.wepay.kafka.connect.bigquery.utils; +import java.util.HashMap; import java.util.Map; -import java.util.stream.Collectors; public class FieldNameSanitizer { @@ -20,15 +39,17 @@ public static String sanitizeName(String name) { // letters, numbers, and underscores. // Note: a.b and a/b will have the same value after sanitization which will cause Duplicate key // Exception. + @SuppressWarnings("unchecked") public static Map replaceInvalidKeys(Map map) { - return map.entrySet().stream().collect(Collectors.toMap( - (entry) -> sanitizeName(entry.getKey()), - (entry) -> { - if (entry.getValue() instanceof Map) { - return replaceInvalidKeys((Map) entry.getValue()); - } - return entry.getValue(); - } - )); + Map result = new HashMap<>(); + map.forEach((key, value) -> { + String sanitizedKey = sanitizeName(key); + if (value instanceof Map) { + result.put(sanitizedKey, replaceInvalidKeys((Map) value)); + } else { + result.put(sanitizedKey, value); + } + }); + return result; } } diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/utils/PartitionedTableId.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/utils/PartitionedTableId.java index 4f3e3ee48..f38254c1e 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/utils/PartitionedTableId.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/utils/PartitionedTableId.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.utils; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,15 +17,22 @@ * under the License. 
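A short sketch of what the rewritten `replaceInvalidKeys` above is expected to do; the sample keys and the sanitized results shown in the comment are assumptions based on the letters/numbers/underscores rule noted in the code:

```java
import com.wepay.kafka.connect.bigquery.utils.FieldNameSanitizer;

import java.util.HashMap;
import java.util.Map;

public class SanitizerSketch {
  public static void main(String[] args) {
    Map<String, Object> nested = new HashMap<>();
    nested.put("created.at", 1234567890L);

    Map<String, Object> row = new HashMap<>();
    row.put("user-name", "alice");
    row.put("meta.data", nested); // nested maps are sanitized recursively

    // Expected keys afterwards: user_name, meta_data, and created_at inside the nested map.
    System.out.println(FieldNameSanitizer.replaceInvalidKeys(row));
  }
}
```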
*/ +package com.wepay.kafka.connect.bigquery.utils; import com.google.cloud.bigquery.TableId; import java.time.Clock; +import java.time.Instant; import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.ZoneId; import java.time.format.DateTimeFormatter; /** * A TableId with separate base table name and partition information. + * Note that this class only supports partitioning by day; even though BigQuery supports other time partitioning types + * for tables partitioned by ingestion time, it doesn't support decorator syntax (i.e., appending "$YYYYMMDD" to the + * name of a table being streamed to) for these other time partitioning types. */ public class PartitionedTableId { diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/utils/SinkRecordConverter.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/utils/SinkRecordConverter.java index 96cd2144b..848f598b0 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/utils/SinkRecordConverter.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/utils/SinkRecordConverter.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.utils; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,12 +17,23 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.utils; + import com.google.cloud.bigquery.InsertAllRequest; +import com.google.cloud.bigquery.TableId; +import com.wepay.kafka.connect.bigquery.MergeQueries; import com.wepay.kafka.connect.bigquery.api.KafkaSchemaRecordType; +import com.wepay.kafka.connect.bigquery.config.BigQuerySinkTaskConfig; import com.wepay.kafka.connect.bigquery.convert.KafkaDataBuilder; import com.wepay.kafka.connect.bigquery.convert.RecordConverter; +import com.wepay.kafka.connect.bigquery.write.batch.MergeBatches; +import org.apache.kafka.common.record.TimestampType; +import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.sink.SinkRecord; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.HashMap; import java.util.Map; import java.util.Optional; @@ -30,36 +41,108 @@ * A class for converting a {@link SinkRecord SinkRecord} to {@link InsertAllRequest.RowToInsert BigQuery row} */ public class SinkRecordConverter { + private static final Logger logger = LoggerFactory.getLogger(SinkRecordConverter.class); + + private final BigQuerySinkTaskConfig config; + private final MergeBatches mergeBatches; + private final MergeQueries mergeQueries; + private final RecordConverter> recordConverter; - private final boolean sanitizeFieldName; - private final Optional kafkaKeyFieldName; - private final Optional kafkaDataFieldName; - - public SinkRecordConverter(RecordConverter> recordConverter, boolean sanitizeFieldName, Optional kafkaKeyFieldName, Optional kafkaDataFieldName) { - this.recordConverter = recordConverter; - this.sanitizeFieldName = sanitizeFieldName; - this.kafkaKeyFieldName = kafkaKeyFieldName; - this.kafkaDataFieldName = kafkaDataFieldName; + private final long mergeRecordsThreshold; + private final boolean useMessageTimeDatePartitioning; + private final boolean usePartitionDecorator; + + + public SinkRecordConverter(BigQuerySinkTaskConfig config, + MergeBatches mergeBatches, MergeQueries 
mergeQueries) { + this.config = config; + this.mergeBatches = mergeBatches; + this.mergeQueries = mergeQueries; + + this.recordConverter = config.getRecordConverter(); + this.mergeRecordsThreshold = config.getLong(config.MERGE_RECORDS_THRESHOLD_CONFIG); + this.useMessageTimeDatePartitioning = + config.getBoolean(config.BIGQUERY_MESSAGE_TIME_PARTITIONING_CONFIG); + this.usePartitionDecorator = + config.getBoolean(config.BIGQUERY_PARTITION_DECORATOR_CONFIG); + + } + + public InsertAllRequest.RowToInsert getRecordRow(SinkRecord record, TableId table) { + Map convertedRecord = config.isUpsertDeleteEnabled() + ? getUpsertDeleteRow(record, table) + : getRegularRow(record); + + Map result = config.getBoolean(config.SANITIZE_FIELD_NAME_CONFIG) + ? FieldNameSanitizer.replaceInvalidKeys(convertedRecord) + : convertedRecord; + + return InsertAllRequest.RowToInsert.of(getRowId(record), result); } - public InsertAllRequest.RowToInsert getRecordRow(SinkRecord record) { - Map convertedRecord = recordConverter.convertRecord(record, KafkaSchemaRecordType.VALUE); - if (kafkaKeyFieldName.isPresent()) { - convertedRecord.put(kafkaKeyFieldName.get(), recordConverter.convertRecord(record, KafkaSchemaRecordType.KEY)); + private Map getUpsertDeleteRow(SinkRecord record, TableId table) { + // Unconditionally allow tombstone records if delete is enabled. + Map convertedValue = config.getBoolean(config.DELETE_ENABLED_CONFIG) && record.value() == null + ? null + : recordConverter.convertRecord(record, KafkaSchemaRecordType.VALUE); + + if (convertedValue != null) { + config.getKafkaDataFieldName().ifPresent( + fieldName -> convertedValue.put(fieldName, KafkaDataBuilder.buildKafkaDataRecord(record)) + ); } - if (kafkaDataFieldName.isPresent()) { - convertedRecord.put(kafkaDataFieldName.get(), KafkaDataBuilder.buildKafkaDataRecord(record)); + + Map result = new HashMap<>(); + long totalBatchSize = mergeBatches.addToBatch(record, table, result); + if (mergeRecordsThreshold != -1 && totalBatchSize >= mergeRecordsThreshold) { + logger.debug("Triggering merge flush for table {} since the size of its current batch has " + + "exceeded the configured threshold of {}}", + table, mergeRecordsThreshold); + mergeQueries.mergeFlush(table); } - if (sanitizeFieldName) { - convertedRecord = FieldNameSanitizer.replaceInvalidKeys(convertedRecord); + + Map convertedKey = recordConverter.convertRecord(record, KafkaSchemaRecordType.KEY); + if (convertedKey == null) { + throw new ConnectException("Record keys must be non-null when upsert/delete is enabled"); + } + + result.put(MergeQueries.INTERMEDIATE_TABLE_KEY_FIELD_NAME, convertedKey); + result.put(MergeQueries.INTERMEDIATE_TABLE_VALUE_FIELD_NAME, convertedValue); + result.put(MergeQueries.INTERMEDIATE_TABLE_ITERATION_FIELD_NAME, totalBatchSize); + if (usePartitionDecorator && useMessageTimeDatePartitioning) { + if (record.timestampType() == TimestampType.NO_TIMESTAMP_TYPE) { + throw new ConnectException( + "Message has no timestamp type, cannot use message timestamp to partition."); + } + result.put(MergeQueries.INTERMEDIATE_TABLE_PARTITION_TIME_FIELD_NAME, record.timestamp()); + } else { + // Provide a value for this column even if it's not used for partitioning in the destination + // table, so that it can be used to deduplicate rows during merge flushes + result.put(MergeQueries.INTERMEDIATE_TABLE_PARTITION_TIME_FIELD_NAME, System.currentTimeMillis() / 1000); } - return InsertAllRequest.RowToInsert.of(getRowId(record), convertedRecord); + + return result; + } + + private Map 
getRegularRow(SinkRecord record) { + Map result = recordConverter.convertRecord(record, KafkaSchemaRecordType.VALUE); + + config.getKafkaDataFieldName().ifPresent( + fieldName -> result.put(fieldName, KafkaDataBuilder.buildKafkaDataRecord(record)) + ); + + config.getKafkaKeyFieldName().ifPresent(fieldName -> { + Map keyData = recordConverter.convertRecord(record, KafkaSchemaRecordType.KEY); + result.put(fieldName, keyData); + }); + + return result; } private String getRowId(SinkRecord record) { return String.format("%s-%d-%d", - record.topic(), - record.kafkaPartition(), - record.kafkaOffset()); + record.topic(), + record.kafkaPartition(), + record.kafkaOffset()); } } diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/utils/TableNameUtils.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/utils/TableNameUtils.java new file mode 100644 index 000000000..6005fe627 --- /dev/null +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/utils/TableNameUtils.java @@ -0,0 +1,37 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package com.wepay.kafka.connect.bigquery.utils; + +import com.google.cloud.bigquery.TableId; + +public class TableNameUtils { + + public static String table(TableId table) { + return String.format("table `%s`.`%s`", table.getDataset(), table.getTable()); + } + + public static String intTable(TableId table) { + return "intermediate " + table(table); + } + + public static String destTable(TableId table) { + return "destination " + table(table); + } +} diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/utils/Version.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/utils/Version.java index bbcdfae38..8a6c6f4be 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/utils/Version.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/utils/Version.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.utils; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.utils; /** * Utility class for unifying the version of a project. 
All other references to version number diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/CountDownRunnable.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/CountDownRunnable.java index cbdca4ea6..70edc8a1a 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/CountDownRunnable.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/CountDownRunnable.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.write.batch; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.write.batch; import org.apache.kafka.connect.errors.ConnectException; diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/GCSBatchTableWriter.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/GCSBatchTableWriter.java index 45d12e1dd..dce09101e 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/GCSBatchTableWriter.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/GCSBatchTableWriter.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.write.batch; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.write.batch; import com.google.cloud.bigquery.InsertAllRequest.RowToInsert; import com.google.cloud.bigquery.TableId; @@ -47,7 +48,7 @@ public class GCSBatchTableWriter implements Runnable { private final String bucketName; private final String blobName; - private SortedMap rows; + private final SortedMap rows; private final GCSToBQWriter writer; /** @@ -90,7 +91,7 @@ public static class Builder implements TableWriterBuilder { private String blobName; private final TableId tableId; - private SortedMap rows; + private final SortedMap rows; private final SinkRecordConverter recordConverter; private final GCSToBQWriter writer; @@ -119,19 +120,12 @@ public Builder(GCSToBQWriter writer, this.writer = writer; } - public Builder setBlobName(String blobName) { - this.blobName = blobName; - return this; - } - - /** - * Adds a record to the builder. 
- * @param record the row to add - */ - public void addRow(SinkRecord record) { - rows.put(record, recordConverter.getRecordRow(record)); + @Override + public void addRow(SinkRecord record, TableId table) { + rows.put(record, recordConverter.getRecordRow(record, table)); } + @Override public GCSBatchTableWriter build() { return new GCSBatchTableWriter(rows, writer, tableId, bucketName, blobName); } diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/KCBQThreadPoolExecutor.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/KCBQThreadPoolExecutor.java index 637ac2ea4..75daaf79b 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/KCBQThreadPoolExecutor.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/KCBQThreadPoolExecutor.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.write.batch; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,21 +17,20 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.write.batch; import com.wepay.kafka.connect.bigquery.config.BigQuerySinkTaskConfig; import com.wepay.kafka.connect.bigquery.exception.BigQueryConnectException; +import com.wepay.kafka.connect.bigquery.exception.ExpectedInterruptException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; +import java.util.Optional; import java.util.concurrent.BlockingQueue; -import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; +import java.util.concurrent.atomic.AtomicReference; /** * ThreadPoolExecutor for writing Rows to BigQuery. 
@@ -43,9 +42,7 @@ public class KCBQThreadPoolExecutor extends ThreadPoolExecutor { private static final Logger logger = LoggerFactory.getLogger(KCBQThreadPoolExecutor.class); - - private ConcurrentHashMap.KeySetView encounteredErrors = - ConcurrentHashMap.newKeySet(); + private final AtomicReference encounteredError = new AtomicReference<>(); /** * @param config the {@link BigQuerySinkTaskConfig} @@ -64,12 +61,11 @@ public KCBQThreadPoolExecutor(BigQuerySinkTaskConfig config, protected void afterExecute(Runnable runnable, Throwable throwable) { super.afterExecute(runnable, throwable); - if (throwable != null) { - logger.error("Task failed with {} error: {}", - throwable.getClass().getName(), - throwable.getMessage()); - logger.debug("Error Task Stacktrace:", throwable); - encounteredErrors.add(throwable); + if (throwable != null && !(throwable instanceof ExpectedInterruptException)) { + // Log at debug level since this will be shown to the user at error level by the Connect framework if it causes + // the task to fail, and will otherwise just pollute logs and potentially mislead users + logger.debug("A write thread has failed with an unrecoverable error", throwable); + encounteredError.compareAndSet(null, throwable); } } @@ -91,19 +87,18 @@ public void awaitCurrentTasks() throws InterruptedException, BigQueryConnectExce execute(new CountDownRunnable(countDownLatch)); } countDownLatch.await(); - if (encounteredErrors.size() > 0) { - String errorString = createErrorString(encounteredErrors); - encounteredErrors.clear(); - throw new BigQueryConnectException("Some write threads encountered unrecoverable errors: " - + errorString + "; See logs for more detail"); - } + maybeThrowEncounteredError(); } - private static String createErrorString(Collection errors) { - List exceptionTypeStrings = new ArrayList<>(errors.size()); - exceptionTypeStrings.addAll(errors.stream() - .map(throwable -> throwable.getClass().getName()) - .collect(Collectors.toList())); - return String.join(", ", exceptionTypeStrings); + /** + * Immediately throw an exception if any unrecoverable errors were encountered by any of the write + * tasks. + * + * @throws BigQueryConnectException if any of the tasks failed. + */ + public void maybeThrowEncounteredError() { + Optional.ofNullable(encounteredError.get()).ifPresent(t -> { + throw new BigQueryConnectException("A write thread has failed with an unrecoverable error", t); + }); } } diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/MergeBatches.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/MergeBatches.java new file mode 100644 index 000000000..6a11faee0 --- /dev/null +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/MergeBatches.java @@ -0,0 +1,350 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
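The executor above now keeps only the first unrecoverable failure in an `AtomicReference` and rethrows it later from `maybeThrowEncounteredError`; a minimal, generic sketch of that first-error-wins pattern (deliberately not the connector class itself):

```java
import java.util.Optional;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;

public class FirstErrorWinsSketch {
  private static final AtomicReference<Throwable> encounteredError = new AtomicReference<>();

  public static void main(String[] args) throws InterruptedException {
    ExecutorService executor = Executors.newFixedThreadPool(2);
    for (int i = 0; i < 2; i++) {
      final int taskId = i;
      executor.execute(() -> {
        try {
          throw new RuntimeException("task " + taskId + " failed");
        } catch (Throwable t) {
          // Only the first failure is kept; later ones are ignored.
          encounteredError.compareAndSet(null, t);
        }
      });
    }
    executor.shutdown();
    executor.awaitTermination(10, TimeUnit.SECONDS);

    // Mirrors maybeThrowEncounteredError: surface the stored failure, if any.
    Optional.ofNullable(encounteredError.get()).ifPresent(t -> {
      throw new RuntimeException("A write thread has failed with an unrecoverable error", t);
    });
  }
}
```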
+ */ + +package com.wepay.kafka.connect.bigquery.write.batch; + +import com.google.cloud.bigquery.InsertAllRequest; +import com.google.cloud.bigquery.TableId; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.BiMap; +import com.google.common.collect.HashBiMap; +import com.google.common.collect.Maps; +import com.wepay.kafka.connect.bigquery.MergeQueries; +import com.wepay.kafka.connect.bigquery.exception.ExpectedInterruptException; +import com.wepay.kafka.connect.bigquery.utils.FieldNameSanitizer; +import org.apache.kafka.clients.consumer.OffsetAndMetadata; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.connect.errors.ConnectException; +import org.apache.kafka.connect.sink.SinkRecord; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; + +import static com.wepay.kafka.connect.bigquery.utils.TableNameUtils.intTable; + +public class MergeBatches { + private static final Logger logger = LoggerFactory.getLogger(MergeBatches.class); + private static final long STREAMING_BUFFER_AVAILABILITY_WAIT_MS = 10_000L; + + private static long streamingBufferAvailabilityWaitMs = STREAMING_BUFFER_AVAILABILITY_WAIT_MS; + + private final String intermediateTableSuffix; + private final BiMap intermediateToDestinationTables; + private final ConcurrentMap batchNumbers; + private final ConcurrentMap> batches; + private final Map offsets; + + @VisibleForTesting + public static void setStreamingBufferAvailabilityWait(long waitMs) { + streamingBufferAvailabilityWaitMs = waitMs; + } + + @VisibleForTesting + public static void resetStreamingBufferAvailabilityWait() { + streamingBufferAvailabilityWaitMs = STREAMING_BUFFER_AVAILABILITY_WAIT_MS; + } + + public MergeBatches(String intermediateTableSuffix) { + this.intermediateTableSuffix = intermediateTableSuffix; + + this.intermediateToDestinationTables = Maps.synchronizedBiMap(HashBiMap.create()); + this.batchNumbers = new ConcurrentHashMap<>(); + this.batches = new ConcurrentHashMap<>(); + this.offsets = new HashMap<>(); + } + + /** + * Get the latest safe-to-commit offsets for every topic partition that has had at least one + * record make its way to a destination table. 
+ * @return the offsets map which can be used in + * {@link org.apache.kafka.connect.sink.SinkTask#preCommit(Map)}; never null + */ + public Map latestOffsets() { + synchronized (offsets) { + return offsets.entrySet().stream().collect(Collectors.toMap( + Map.Entry::getKey, + entry -> new OffsetAndMetadata(entry.getValue()) + )); + } + } + + /** + * @return a thread-safe map from intermediate tables to destination tables; never null + */ + public Map intermediateToDestinationTables() { + return intermediateToDestinationTables; + } + + /** + * @return a collection of all currently-in-use intermediate tables; never null + */ + public Collection intermediateTables() { + return intermediateToDestinationTables.keySet(); + } + + /** + * Get the intermediate table for a given destination table, computing a new one if necessary + * @param destinationTable the destination table to fetch an intermediate table for + * @return the {@link TableId} of the intermediate table; never null + */ + public TableId intermediateTableFor(TableId destinationTable) { + return intermediateToDestinationTables.inverse() + .computeIfAbsent(destinationTable, this::newIntermediateTable); + } + + private TableId newIntermediateTable(TableId destinationTable) { + String tableName = FieldNameSanitizer.sanitizeName( + destinationTable.getTable() + intermediateTableSuffix + ); + TableId result = TableId.of( + destinationTable.getDataset(), + tableName + ); + + batchNumbers.put(result, new AtomicInteger()); + batches.put(result, new ConcurrentHashMap<>()); + + return result; + } + + public TableId destinationTableFor(TableId intermediateTable) { + return intermediateToDestinationTables.get(intermediateTable); + } + + /** + * Find a batch number for the record, insert that number into the converted value, record the + * offset for that record, and return the total size of that batch. + * @param record the record for the batch + * @param intermediateTable the intermediate table the record will be streamed into + * @param convertedRecord the converted record that will be passed to the BigQuery client + * @return the total number of records in the batch that this record is added to + */ + public long addToBatch(SinkRecord record, TableId intermediateTable, Map convertedRecord) { + AtomicInteger batchCount = batchNumbers.get(intermediateTable); + // Synchronize here to ensure that the batch number isn't bumped in the middle of this method. + // On its own, that wouldn't be such a bad thing, but since a merge flush is supposed to + // immediately follow that bump, it might cause some trouble if we want to add this row to the + // batch but a merge flush on that batch has already started. This way, either the batch number + // is bumped before we add the row to the batch (in which case, the row is added to the fresh + // batch), or the row is added to the batch before preparation for the flush takes place and it + // is safely counted and tracked there. 
+ synchronized (batchCount) { + int batchNumber = batchCount.get(); + convertedRecord.put(MergeQueries.INTERMEDIATE_TABLE_BATCH_NUMBER_FIELD, batchNumber); + + Batch batch = batches.get(intermediateTable).computeIfAbsent(batchNumber, n -> new Batch()); + batch.recordOffsetFor(record); + + long pendingBatchSize = batch.increment(); + logger.trace("Added record to batch {} for {}; {} rows are currently pending", + batchNumber, intTable(intermediateTable), pendingBatchSize); + return batch.total(); + } + } + + /** + * Record a successful write of a list of rows to the given intermediate table, and decrease the + * pending record counts for every applicable batch accordingly. + * @param intermediateTable the intermediate table + * @param rows the rows + */ + public void onRowWrites(TableId intermediateTable, Collection rows) { + Map rowsByBatch = rows.stream().collect(Collectors.groupingBy( + row -> (Integer) row.getContent().get(MergeQueries.INTERMEDIATE_TABLE_BATCH_NUMBER_FIELD), + Collectors.counting() + )); + + rowsByBatch.forEach((batchNumber, batchSize) -> { + Batch batch = batch(intermediateTable, batchNumber); + synchronized (batch) { + long remainder = batch.recordWrites(batchSize); + batch.notifyAll(); + logger.trace("Notified merge flush executor of successful write of {} rows " + + "for batch {} for {}; {} rows remaining", + batchSize, batchNumber, intTable(intermediateTable), remainder); + } + }); + } + + /** + * Increment the batch number for the given table, and return the old batch number. + * @param intermediateTable the table whose batch number should be incremented + * @return the batch number for the table, pre-increment + */ + public int incrementBatch(TableId intermediateTable) { + AtomicInteger batchCount = batchNumbers.get(intermediateTable); + // See addToBatch for an explanation of the synchronization here + synchronized (batchCount) { + return batchCount.getAndIncrement(); + } + } + + /** + * Prepare to merge the batch for the given table, by ensuring that all prior batches for that + * table have completed and that all rows for the batch itself have been written. 
+ * @param intermediateTable the table for the batch + * @param batchNumber the batch number to prepare to flush + * @return whether a flush is necessary (will be false if no rows were present in the given batch) + */ + public boolean prepareToFlush(TableId intermediateTable, int batchNumber) { + final ConcurrentMap allBatchesForTable = batches.get(intermediateTable); + if (batchNumber != 0) { + final int priorBatchNumber = batchNumber - 1; + synchronized (allBatchesForTable) { + logger.debug("Ensuring batch {} is completed for {} before flushing batch {}", + priorBatchNumber, intTable(intermediateTable), batchNumber); + while (allBatchesForTable.containsKey(priorBatchNumber)) { + try { + allBatchesForTable.wait(); + } catch (InterruptedException e) { + logger.warn("Interrupted while waiting for batch {} to complete for {}", + batchNumber, intTable(intermediateTable)); + throw new ExpectedInterruptException(String.format( + "Interrupted while waiting for batch %d to complete for %s", + batchNumber, intTable(intermediateTable) + )); + } + } + } + } else { + logger.debug("Flushing first batch for {}", intTable(intermediateTable)); + } + + final Batch currentBatch = allBatchesForTable.get(batchNumber); + if (currentBatch == null) { + logger.trace("No rows to write in batch {} for {}", batchNumber, intTable(intermediateTable)); + return false; + } + + synchronized (currentBatch) { + logger.debug("{} rows currently remaining for batch {} for {}", + currentBatch.pending(), batchNumber, intTable(intermediateTable)); + while (currentBatch.pending() != 0) { + logger.trace("Waiting for all rows for batch {} from {} to be written before flushing; {} remaining", + batchNumber, intTable(intermediateTable), currentBatch.pending()); + try { + currentBatch.wait(); + } catch (InterruptedException e) { + logger.warn("Interrupted while waiting for all rows for batch {} from {} to be written", + batchNumber, intTable(intermediateTable)); + throw new ExpectedInterruptException(String.format( + "Interrupted while waiting for all rows for batch %d from %s to be written", + batchNumber, intTable(intermediateTable) + )); + } + } + } + + try { + logger.trace( + "Waiting {}ms before running merge query on batch {} from {} " + + "in order to ensure that all rows are available in the streaming buffer", + streamingBufferAvailabilityWaitMs, batchNumber, intTable(intermediateTable)); + Thread.sleep(streamingBufferAvailabilityWaitMs); + } catch (InterruptedException e) { + logger.warn("Interrupted while waiting before merge flushing batch {} for {}", + batchNumber, intTable(intermediateTable)); + throw new ExpectedInterruptException(String.format( + "Interrupted while waiting before merge flushing batch %d for %s", + batchNumber, intTable(intermediateTable) + )); + } + return true; + } + + /** + * Record a successful merge flush of all of the rows for the given batch in the intermediate + * table, alert any waiting merge flushes that are predicated on the completion of this merge + * flush, and marke the offsets for all of those rows as safe to commit. 
+ * @param intermediateTable the source of the merge flush + * @param batchNumber the batch for the merge flush + */ + public void recordSuccessfulFlush(TableId intermediateTable, int batchNumber) { + logger.trace("Successfully merge flushed batch {} for {}", + batchNumber, intTable(intermediateTable)); + final ConcurrentMap allBatchesForTable = batches.get(intermediateTable); + Batch batch = allBatchesForTable.remove(batchNumber); + + synchronized (allBatchesForTable) { + allBatchesForTable.notifyAll(); + } + + synchronized (offsets) { + offsets.putAll(batch.offsets()); + } + } + + private Batch batch(TableId intermediateTable, int batchNumber) { + return batches.get(intermediateTable).get(batchNumber); + } + + private static class Batch { + private final AtomicLong pending; + private final AtomicLong total; + private final Map offsets; + + public Batch() { + this.total = new AtomicLong(); + this.pending = new AtomicLong(); + this.offsets = new HashMap<>(); + } + + public long pending() { + return pending.get(); + } + + public long total() { + return total.get(); + } + + public Map offsets() { + return offsets; + } + + public void recordOffsetFor(SinkRecord record) { + offsets.put( + new TopicPartition(record.topic(), record.kafkaPartition()), + // Use the offset of the record plus one here since that'll be the offset that we'll + // resume at if/when this record is the last-committed record and then the task is + // restarted + record.kafkaOffset() + 1); + } + + /** + * Increment the total and pending number of records, and return the number of pending records + * @return the number of pending records for this batch + */ + public long increment() { + total.incrementAndGet(); + return pending.incrementAndGet(); + } + + public long recordWrites(long numRows) { + return pending.addAndGet(-numRows); + } + } +} diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/TableWriter.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/TableWriter.java index dee011142..104488c4e 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/TableWriter.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/TableWriter.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.write.batch; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,25 +17,32 @@ * under the License. 
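To make the batching flow above more concrete, a rough driver of `MergeBatches` with made-up table, suffix, and record values; it skips the real write/flush coordination (`onRowWrites`, `prepareToFlush`) and only exercises batch tracking and offset bookkeeping:

```java
import com.google.cloud.bigquery.TableId;
import com.wepay.kafka.connect.bigquery.write.batch.MergeBatches;
import org.apache.kafka.connect.sink.SinkRecord;

import java.util.HashMap;
import java.util.Map;

public class MergeBatchesSketch {
  public static void main(String[] args) {
    // The intermediate-table suffix is an arbitrary illustrative value.
    MergeBatches batches = new MergeBatches("_tmp_connector_sketch");

    TableId destination = TableId.of("my_dataset", "pageviews");
    TableId intermediate = batches.intermediateTableFor(destination);
    System.out.println("Streaming into " + intermediate.getTable());

    // Track one record; addToBatch stamps the converted row with the current batch number.
    SinkRecord record = new SinkRecord("pageviews", 0, null, null, null, null, 41L);
    Map<String, Object> convertedRow = new HashMap<>();
    long batchSize = batches.addToBatch(record, intermediate, convertedRow);
    System.out.println("Rows in current batch: " + batchSize);

    // Close out batch 0 and pretend its merge flush completed successfully.
    int flushedBatch = batches.incrementBatch(intermediate);
    batches.recordSuccessfulFlush(intermediate, flushedBatch);

    // The safe-to-commit offset is the record's offset plus one (41 + 1 = 42).
    System.out.println(batches.latestOffsets());
  }
}
```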
*/ +package com.wepay.kafka.connect.bigquery.write.batch; import com.google.cloud.bigquery.BigQueryException; import com.google.cloud.bigquery.InsertAllRequest.RowToInsert; +import com.google.cloud.bigquery.TableId; import com.wepay.kafka.connect.bigquery.utils.SinkRecordConverter; +import com.wepay.kafka.connect.bigquery.exception.BigQueryConnectException; +import com.wepay.kafka.connect.bigquery.exception.ExpectedInterruptException; import com.wepay.kafka.connect.bigquery.utils.PartitionedTableId; +import com.wepay.kafka.connect.bigquery.write.row.BigQueryErrorResponses; import com.wepay.kafka.connect.bigquery.write.row.BigQueryWriter; -import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.sink.SinkRecord; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; +import java.util.Collection; import java.util.Comparator; import java.util.List; import java.util.Map; import java.util.SortedMap; import java.util.TreeMap; +import java.util.Objects; +import java.util.function.Consumer; /** * Simple Table Writer that attempts to write all the rows it is given at once. @@ -44,24 +51,26 @@ public class TableWriter implements Runnable { private static final Logger logger = LoggerFactory.getLogger(TableWriter.class); - private static final int BAD_REQUEST_CODE = 400; - private static final String INVALID_REASON = "invalid"; - private final BigQueryWriter writer; private final PartitionedTableId table; private final SortedMap rows; + private final Consumer> onFinish; /** * @param writer the {@link BigQueryWriter} to use. * @param table the BigQuery table to write to. * @param rows the rows to write. + * @param onFinish a callback to invoke after all rows have been written successfully, which is + * called with all the rows written by the writer */ public TableWriter(BigQueryWriter writer, PartitionedTableId table, - SortedMap rows) { + SortedMap rows, + Consumer> onFinish) { this.writer = writer; this.table = table; this.rows = rows; + this.onFinish = onFinish; } @Override @@ -85,15 +94,21 @@ public void run() { currentIndex += currentBatchSize; successCount++; } catch (BigQueryException err) { - logger.warn("Could not write batch of size {} to BigQuery.", currentBatchList.size(), err); + logger.warn( + "Could not write batch of size {} to BigQuery. " + + "Error code: {}, underlying error (if present): {}", + currentBatchList.size(), err.getCode(), err.getError(), err); if (isBatchSizeError(err)) { failureCount++; - currentBatchSize = getNewBatchSize(currentBatchSize); + currentBatchSize = getNewBatchSize(currentBatchSize, err); + } else { + // Throw exception on write errors such as 403. + throw new BigQueryConnectException("Failed to write to table", err); } } } } catch (InterruptedException err) { - throw new ConnectException("Thread interrupted while writing to BigQuery.", err); + throw new ExpectedInterruptException("Thread interrupted while writing to BigQuery."); } // Common case is 1 successful call and 0 failed calls: @@ -106,12 +121,29 @@ public void run() { logger.debug(logMessage, rows.size(), successCount, failureCount); } + onFinish.accept(rows.values()); } - private static int getNewBatchSize(int currentBatchSize) { + private static int getNewBatchSize(int currentBatchSize, Throwable err) { if (currentBatchSize == 1) { - // todo correct exception type? 
- throw new ConnectException("Attempted to reduce batch size below 1."); + logger.error("Attempted to reduce batch size below 1"); + throw new BigQueryConnectException( + "Failed to write to BigQuery even after reducing batch size to 1 row at a time. " + + "This can indicate an error in the connector's logic for classifying BigQuery errors, as non-retriable " + + "errors may be treated as retriable. " + + "If that appears to be the case, please report the issue to the project's maintainers and include the " + + "complete stack trace for this error as it appears in the logs. " + + "Alternatively, there may be a record that the connector has read from Kafka that is too large to " + + "write to BigQuery using the streaming insert API, which cannot be addressed with a change to the " + + "connector and will need to be handled externally by optionally writing the record to BigQuery using " + + "another means and then reconfiguring the connector to skip the record. " + + "Finally, streaming insert quotas for BigQuery may be causing insertion failures for the connector; " + + "in that case, please ensure that quotas for maximum rows per second, maximum bytes per second, etc. " + + "are being respected before restarting the connector. " + + "The cause of this exception is the error encountered from BigQuery after the last attempt to write a " + + "batch was made.", + err + ); } // round batch size up so we don't end up with a dangling 1 row at the end. return (int) Math.ceil(currentBatchSize / 2.0); @@ -123,26 +155,15 @@ private static int getNewBatchSize(int currentBatchSize) { * size, or false otherwise. */ private static boolean isBatchSizeError(BigQueryException exception) { - if (exception.getCode() == BAD_REQUEST_CODE - && exception.getError() == null - && exception.getReason() == null) { - /* - * 400 with no error or reason represents a request that is more than 10MB. This is not - * documented but is referenced slightly under "Error codes" here: - * https://cloud.google.com/bigquery/quota-policy - * (by decreasing the batch size we can eventually expect to end up with a request under 10MB) - */ - return true; - } else if (exception.getCode() == BAD_REQUEST_CODE - && INVALID_REASON.equals(exception.getReason())) { - /* - * this is the error that the documentation claims google will return if a request exceeds - * 10MB. if this actually ever happens... - * todo distinguish this from other invalids (like invalid table schema). - */ - return true; - } - return false; + /* + * 400 with no error or reason represents a request that is more than 10MB.
This is not + * documented but is referenced slightly under "Error codes" here: + * https://cloud.google.com/bigquery/quota-policy + * (by decreasing the batch size we can eventually expect to end up with a request under 10MB) + */ + return BigQueryErrorResponses.isUnspecifiedBadRequestError(exception) + || BigQueryErrorResponses.isRequestTooLargeError(exception) + || BigQueryErrorResponses.isTooManyRowsError(exception); } @@ -152,6 +173,7 @@ public static class Builder implements TableWriterBuilder { private SortedMap rows; private SinkRecordConverter recordConverter; + private Consumer> onFinish; /** * @param writer the BigQueryWriter to use @@ -165,22 +187,31 @@ public Builder(BigQueryWriter writer, PartitionedTableId table, SinkRecordConver this.rows = new TreeMap<>(Comparator.comparing(SinkRecord::kafkaPartition) .thenComparing(SinkRecord::kafkaOffset)); this.recordConverter = recordConverter; + + this.onFinish = null; } - /** - * Add a record to the builder. - * @param record the row to add - */ - public void addRow(SinkRecord record) { - rows.put(record, recordConverter.getRecordRow(record)); + @Override + public void addRow(SinkRecord record, TableId table) { + rows.put(record, recordConverter.getRecordRow(record, table)); } /** - * Create a {@link TableWriter} from this builder. - * @return a TableWriter containing the given writer, table, topic, and all added rows. + * Specify a callback to be invoked after all rows have been written. The callback will be + * invoked with the full list of rows written by this table writer. + * @param onFinish the callback to invoke; may not be null + * @throws IllegalStateException if invoked more than once on a single builder instance */ + public void onFinish(Consumer> onFinish) { + if (this.onFinish != null) { + throw new IllegalStateException("Cannot overwrite existing finish callback"); + } + this.onFinish = Objects.requireNonNull(onFinish, "Finish callback cannot be null"); + } + + @Override public TableWriter build() { - return new TableWriter(writer, table, rows); + return new TableWriter(writer, table, rows, onFinish != null ? onFinish : n -> { }); } } } diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/TableWriterBuilder.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/TableWriterBuilder.java index 556ae98e3..506e78346 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/TableWriterBuilder.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/batch/TableWriterBuilder.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.write.batch; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,9 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.write.batch; +import com.google.cloud.bigquery.TableId; import org.apache.kafka.connect.sink.SinkRecord; /** @@ -28,8 +30,9 @@ public interface TableWriterBuilder { /** * Add a record to the builder. * @param sinkRecord the row to add. + * @param table the table the row will be written to. */ - void addRow(SinkRecord sinkRecord); + void addRow(SinkRecord sinkRecord, TableId table); /** * Create a {@link TableWriter} from this builder. 
diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/AdaptiveBigQueryWriter.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/AdaptiveBigQueryWriter.java index 10c30babc..5232ccc32 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/AdaptiveBigQueryWriter.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/AdaptiveBigQueryWriter.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.write.row; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.write.row; import com.google.cloud.bigquery.BigQuery; import com.google.cloud.bigquery.TableId; @@ -28,16 +29,17 @@ import com.wepay.kafka.connect.bigquery.SchemaManager; import com.wepay.kafka.connect.bigquery.exception.BigQueryConnectException; +import com.wepay.kafka.connect.bigquery.exception.ExpectedInterruptException; import com.wepay.kafka.connect.bigquery.utils.PartitionedTableId; import org.apache.kafka.connect.sink.SinkRecord; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.SortedMap; /** @@ -47,8 +49,8 @@ public class AdaptiveBigQueryWriter extends BigQueryWriter { private static final Logger logger = LoggerFactory.getLogger(AdaptiveBigQueryWriter.class); // The maximum number of retries we will attempt to write rows after creating a table or updating a BQ table schema. - private static final int RETRY_LIMIT = 5; - // Wait for about 30s between each retry since both creating table and updating schema take up to 2~3 minutes to take effect. + private static final int RETRY_LIMIT = 30; + // Wait for about 30s between each retry to avoid hammering BigQuery with requests private static final int RETRY_WAIT_TIME = 30000; private final BigQuery bigQuery; @@ -60,6 +62,7 @@ public class AdaptiveBigQueryWriter extends BigQueryWriter { * @param schemaManager Used to update BigQuery tables. * @param retry How many retries to make in the event of a 500/503 error. * @param retryWait How long to wait in between retries. 
+ * @param autoCreateTables Whether tables should be automatically created */ public AdaptiveBigQueryWriter(BigQuery bigQuery, SchemaManager schemaManager, @@ -72,18 +75,6 @@ public AdaptiveBigQueryWriter(BigQuery bigQuery, this.autoCreateTables = autoCreateTables; } - private boolean isTableMissingSchema(BigQueryException exception) { - // If a table is missing a schema, it will raise a BigQueryException that the input is invalid - // For more information about BigQueryExceptions, see: https://cloud.google.com/bigquery/troubleshooting-errors - return exception.getReason() != null && exception.getReason().equalsIgnoreCase("invalid"); - } - - private boolean isTableNotExistedException(BigQueryException exception) { - // If a table does not exist, it will raise a BigQueryException that the input is notFound - // Referring to Google Cloud Error Codes Doc: https://cloud.google.com/bigquery/docs/error-messages?hl=en - return exception.getCode() == 404; - } - /** * Sends the request to BigQuery, then checks the response to see if any errors have occurred. If * any have, and all errors can be blamed upon invalid columns in the rows sent, attempts to @@ -103,14 +94,14 @@ public Map> performWriteRequest( // Should only perform one schema update attempt. if (writeResponse.hasErrors() && onlyContainsInvalidSchemaErrors(writeResponse.getInsertErrors())) { - attemptSchemaUpdate(tableId, rows.keySet()); + attemptSchemaUpdate(tableId, new ArrayList<>(rows.keySet())); } } catch (BigQueryException exception) { // Should only perform one table creation attempt. - if (isTableNotExistedException(exception) && autoCreateTables && bigQuery.getTable(tableId.getBaseTableId()) == null) { - attemptTableCreate(tableId.getBaseTableId(), rows.keySet()); - } else if (isTableMissingSchema(exception)) { - attemptSchemaUpdate(tableId, rows.keySet()); + if (BigQueryErrorResponses.isNonExistentTableError(exception) && autoCreateTables) { + attemptTableCreate(tableId.getBaseTableId(), new ArrayList<>(rows.keySet())); + } else if (BigQueryErrorResponses.isTableMissingSchemaError(exception)) { + attemptSchemaUpdate(tableId, new ArrayList<>(rows.keySet())); } else { throw exception; } @@ -128,7 +119,14 @@ && onlyContainsInvalidSchemaErrors(writeResponse.getInsertErrors())) { logger.debug("re-attempting insertion"); writeResponse = bigQuery.insertAll(request); } catch (BigQueryException exception) { - // no-op, we want to keep retrying the insert + if ((BigQueryErrorResponses.isNonExistentTableError(exception) && autoCreateTables) + || BigQueryErrorResponses.isTableMissingSchemaError(exception) + ) { + // no-op, we want to keep retrying the insert + logger.debug("insertion failed", exception); + } else { + throw exception; + } } } else { return writeResponse.getInsertErrors(); @@ -142,14 +140,14 @@ && onlyContainsInvalidSchemaErrors(writeResponse.getInsertErrors())) { try { Thread.sleep(RETRY_WAIT_TIME); } catch (InterruptedException e) { - // no-op, we want to keep retrying the insert + throw new ExpectedInterruptException("Interrupted while waiting to retry write"); } } logger.debug("table insertion completed successfully"); return new HashMap<>(); } - private void attemptSchemaUpdate(PartitionedTableId tableId, Set records) { + protected void attemptSchemaUpdate(PartitionedTableId tableId, List records) { try { schemaManager.updateSchema(tableId.getBaseTableId(), records); } catch (BigQueryException exception) { @@ -158,9 +156,8 @@ private void attemptSchemaUpdate(PartitionedTableId tableId, Set rec } } - private void 
attemptTableCreate(TableId tableId, Set records) { + protected void attemptTableCreate(TableId tableId, List records) { try { - logger.info("Table {} does not exist, auto-creating table", tableId); schemaManager.createTable(tableId, records); } catch (BigQueryException exception) { throw new BigQueryConnectException( @@ -178,12 +175,13 @@ private void attemptTableCreate(TableId tableId, Set records) { * This is why we can't have nice things, Google. */ private boolean onlyContainsInvalidSchemaErrors(Map> errors) { + logger.trace("write response contained errors: \n{}", errors); boolean invalidSchemaError = false; for (List errorList : errors.values()) { for (BigQueryError error : errorList) { - if (error.getReason().equals("invalid") && (error.getMessage().contains("no such field") || error.getMessage().contains("Missing required field"))) { + if (BigQueryErrorResponses.isMissingRequiredFieldError(error) || BigQueryErrorResponses.isUnrecognizedFieldError(error)) { invalidSchemaError = true; - } else if (!error.getReason().equals("stopped")) { + } else if (!BigQueryErrorResponses.isStoppedError(error)) { /* if some rows are in the old schema format, and others aren't, the old schema * formatted rows will show up as error: stopped. We still want to continue if this is * the case, because these errors don't represent a unique error if there are also diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/BigQueryErrorResponses.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/BigQueryErrorResponses.java new file mode 100644 index 000000000..fc1635401 --- /dev/null +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/BigQueryErrorResponses.java @@ -0,0 +1,144 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package com.wepay.kafka.connect.bigquery.write.row; + +import com.google.cloud.bigquery.BigQueryError; +import com.google.cloud.bigquery.BigQueryException; + +import java.io.IOException; +import java.util.Optional; +import java.util.function.Function; + +/** + * Handles the logic for classifying BigQuery error responses and determining things like whether they come from an + * invalid schema error, a backend error, etc. This can be used to determine whether a table needs to be created before + * retrying an insert or if a temporary server-side error requires us to retry a request, for example. 
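// Minimal usage sketch (illustrative only, not lines from this patch): a row writer could branch
// on these predicates to decide whether to retry, create the missing table, or fail fast. The
// retryLater(...) and createTableAndRetry(...) helpers are assumptions made for this example.
try {
  bigQuery.insertAll(request);
} catch (BigQueryException e) {
  if (BigQueryErrorResponses.isBackendError(e)
      || BigQueryErrorResponses.isQuotaExceededError(e)
      || BigQueryErrorResponses.isRateLimitExceededError(e)) {
    retryLater(request);           // transient server-side failures: safe to retry after a wait
  } else if (BigQueryErrorResponses.isNonExistentTableError(e)) {
    createTableAndRetry(request);  // the destination table may simply not exist yet
  } else {
    throw e;                       // anything else is treated as non-retriable
  }
}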
+ */ +public class BigQueryErrorResponses { + + private static final int BAD_REQUEST_CODE = 400; + private static final int FORBIDDEN_CODE = 403; + private static final int NOT_FOUND_CODE = 404; + private static final int INTERNAL_SERVICE_ERROR_CODE = 500; + private static final int BAD_GATEWAY_CODE = 502; + private static final int SERVICE_UNAVAILABLE_CODE = 503; + + private static final String BAD_REQUEST_REASON = "badRequest"; + private static final String INVALID_REASON = "invalid"; + private static final String NOT_FOUND_REASON = "notFound"; + private static final String QUOTA_EXCEEDED_REASON = "quotaExceeded"; + private static final String RATE_LIMIT_EXCEEDED_REASON = "rateLimitExceeded"; + private static final String STOPPED_REASON = "stopped"; + + + public static boolean isNonExistentTableError(BigQueryException exception) { + String message = message(exception.getError()); + // If a table does not exist, it will raise a BigQueryException that the input is notFound + // Referring to Google Cloud Error Codes Doc: https://cloud.google.com/bigquery/docs/error-messages?hl=en + return NOT_FOUND_CODE == exception.getCode() + && NOT_FOUND_REASON.equals(exception.getReason()) + && (message.startsWith("Not found: Table ") || message.contains("Table is deleted: ")); + } + + public static boolean isTableMissingSchemaError(BigQueryException exception) { + // If a table is missing a schema, it will raise a BigQueryException that the input is invalid + // For more information about BigQueryExceptions, see: https://cloud.google.com/bigquery/troubleshooting-errors + return BAD_REQUEST_CODE == exception.getCode() + && INVALID_REASON.equals(exception.getReason()) + && message(exception.getError()).equals("The destination table has no schema."); + } + + public static boolean isBackendError(BigQueryException exception) { + // backend error: https://cloud.google.com/bigquery/troubleshooting-errors + // for BAD_GATEWAY: https://cloud.google.com/storage/docs/json_api/v1/status-codes + // TODO: possibly this page is inaccurate for bigquery, but the message we are getting + // suggests it's an internal backend error and we should retry, so let's take that at face + // value + return INTERNAL_SERVICE_ERROR_CODE == exception.getCode() + || BAD_GATEWAY_CODE == exception.getCode() + || SERVICE_UNAVAILABLE_CODE == exception.getCode(); + } + + public static boolean isUnspecifiedBadRequestError(BigQueryException exception) { + return BAD_REQUEST_CODE == exception.getCode() + && exception.getError() == null + && exception.getReason() == null; + } + + public static boolean isQuotaExceededError(BigQueryException exception) { + return FORBIDDEN_CODE == exception.getCode() + // TODO: May be able to use exception.getReason() instead of (indirectly) exception.getError().getReason() + // Haven't been able to test yet though, so keeping as-is to avoid breaking anything + && QUOTA_EXCEEDED_REASON.equals(reason(exception.getError())); + } + + public static boolean isRateLimitExceededError(BigQueryException exception) { + return FORBIDDEN_CODE == exception.getCode() + // TODO: May be able to use exception.getReason() instead of (indirectly) exception.getError().getReason() + // Haven't been able to test yet though, so keeping as-is to avoid breaking anything + && RATE_LIMIT_EXCEEDED_REASON.equals(reason(exception.getError())); + } + + public static boolean isRequestTooLargeError(BigQueryException exception) { + return BAD_REQUEST_CODE == exception.getCode() + && BAD_REQUEST_REASON.equals(exception.getReason()) + &&
message(exception.getError()).startsWith("Request payload size exceeds the limit: "); + } + + public static boolean isTooManyRowsError(BigQueryException exception) { + return BAD_REQUEST_CODE == exception.getCode() + && INVALID_REASON.equalsIgnoreCase(exception.getReason()) + && message(exception.getError()).startsWith("too many rows present in the request"); + } + + public static boolean isIOError(BigQueryException error) { + return BigQueryException.UNKNOWN_CODE == error.getCode() + && error.getCause() instanceof IOException; + } + + public static boolean isUnrecognizedFieldError(BigQueryError error) { + return INVALID_REASON.equals(reason(error)) + && message(error).startsWith("no such field: "); + } + + public static boolean isMissingRequiredFieldError(BigQueryError error) { + return INVALID_REASON.equals(reason(error)) + && message(error).startsWith("Missing required field"); + } + + public static boolean isStoppedError(BigQueryError error) { + return STOPPED_REASON.equals(reason(error)) + && message(error).equals(""); + } + + private static String reason(BigQueryError error) { + return extractFromError(error, BigQueryError::getReason); + } + + private static String message(BigQueryError error) { + return extractFromError(error, BigQueryError::getMessage); + } + + private static String extractFromError(BigQueryError error, Function extraction) { + return Optional.ofNullable(error) + .map(extraction) + .orElse(""); + } +} diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/BigQueryWriter.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/BigQueryWriter.java index 2a11653e5..ac90b7866 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/BigQueryWriter.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/BigQueryWriter.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.write.row; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. 
*/ +package com.wepay.kafka.connect.bigquery.write.row; import com.google.cloud.bigquery.BigQueryError; import com.google.cloud.bigquery.BigQueryException; @@ -42,21 +43,13 @@ */ public abstract class BigQueryWriter { - private static final int FORBIDDEN = 403; - private static final int INTERNAL_SERVICE_ERROR = 500; - private static final int BAD_GATEWAY = 502; - private static final int SERVICE_UNAVAILABLE = 503; - private static final String QUOTA_EXCEEDED_REASON = "quotaExceeded"; - private static final String RATE_LIMIT_EXCEEDED_REASON = "rateLimitExceeded"; - private static final int WAIT_MAX_JITTER = 1000; private static final Logger logger = LoggerFactory.getLogger(BigQueryWriter.class); - private static final Random random = new Random(); - - private int retries; - private long retryWaitMs; + private final int retries; + private final long retryWaitMs; + private final Random random; /** * @param retries the number of times to retry a request if BQ returns an internal service error @@ -67,6 +60,8 @@ public abstract class BigQueryWriter { public BigQueryWriter(int retries, long retryWaitMs) { this.retries = retries; this.retryWaitMs = retryWaitMs; + + this.random = new Random(); } /** @@ -123,36 +118,26 @@ public void writeRows(PartitionedTableId table, rows.size() - failedRowsMap.size(), failedRowsMap.size()); // update insert rows and retry in case of partial failure rows = getFailedRows(rows, failedRowsMap.keySet(), table); - mostRecentException = new BigQueryConnectException(failedRowsMap); + mostRecentException = new BigQueryConnectException(table.toString(), failedRowsMap); retryCount++; } else { // throw an exception in case of complete failure - throw new BigQueryConnectException(failedRowsMap); + throw new BigQueryConnectException(table.toString(), failedRowsMap); } } catch (BigQueryException err) { mostRecentException = err; - if (err.getCode() == INTERNAL_SERVICE_ERROR - || err.getCode() == SERVICE_UNAVAILABLE - || err.getCode() == BAD_GATEWAY) { - // backend error: https://cloud.google.com/bigquery/troubleshooting-errors - /* for BAD_GATEWAY: https://cloud.google.com/storage/docs/json_api/v1/status-codes - todo possibly this page is inaccurate for bigquery, but the message we are getting - suggest it's an internal backend error and we should retry, so lets take that at face - value. 
*/ + if (BigQueryErrorResponses.isBackendError(err)) { logger.warn("BQ backend error: {}, attempting retry", err.getCode()); retryCount++; - } else if (err.getCode() == FORBIDDEN - && err.getError() != null - && QUOTA_EXCEEDED_REASON.equals(err.getReason())) { - // quota exceeded error + } else if (BigQueryErrorResponses.isQuotaExceededError(err)) { logger.warn("Quota exceeded for table {}, attempting retry", table); retryCount++; - } else if (err.getCode() == FORBIDDEN - && err.getError() != null - && RATE_LIMIT_EXCEEDED_REASON.equals(err.getReason())) { - // rate limit exceeded error + } else if (BigQueryErrorResponses.isRateLimitExceededError(err)) { logger.warn("Rate limit exceeded for table {}, attempting retry", table); retryCount++; + } else if (BigQueryErrorResponses.isIOError(err)){ + logger.warn("IO Exception: {}, attempting retry", err.getCause().getMessage()); + retryCount++; } else { throw err; } diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/GCSToBQWriter.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/GCSToBQWriter.java index 10a37b660..2e8b839f0 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/GCSToBQWriter.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/GCSToBQWriter.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.write.row; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.write.row; import com.google.cloud.bigquery.BigQuery; import com.google.cloud.bigquery.BigQueryException; @@ -40,11 +41,12 @@ import org.slf4j.LoggerFactory; import java.io.UnsupportedEncodingException; +import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.List; import java.util.Map; import java.util.Random; -import java.util.Set; import java.util.SortedMap; /** @@ -118,7 +120,7 @@ public void writeRows(SortedMap rows, // Check if the table specified exists // This error shouldn't be thrown. 
All tables should be created by the connector at startup if (autoCreateTables && bigQuery.getTable(tableId) == null) { - attemptTableCreate(tableId, rows.keySet()); + attemptTableCreate(tableId, new ArrayList<>(rows.keySet())); } int attemptCount = 0; @@ -198,7 +200,7 @@ private void waitRandomTime() throws InterruptedException { Thread.sleep(retryWaitMs + random.nextInt(WAIT_MAX_JITTER)); } - private void attemptTableCreate(TableId tableId, Set records) { + private void attemptTableCreate(TableId tableId, List records) { try { logger.info("Table {} does not exist, auto-creating table ", tableId); schemaManager.createTable(tableId, records); diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/SimpleBigQueryWriter.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/SimpleBigQueryWriter.java index 1a5757b7a..d7f084a98 100644 --- a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/SimpleBigQueryWriter.java +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/SimpleBigQueryWriter.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.write.row; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.write.row; import com.google.cloud.bigquery.BigQuery; import com.google.cloud.bigquery.BigQueryError; diff --git a/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/UpsertDeleteBigQueryWriter.java b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/UpsertDeleteBigQueryWriter.java new file mode 100644 index 000000000..d713130ac --- /dev/null +++ b/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/UpsertDeleteBigQueryWriter.java @@ -0,0 +1,94 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package com.wepay.kafka.connect.bigquery.write.row; + +import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.bigquery.BigQueryException; +import com.google.cloud.bigquery.TableId; +import com.wepay.kafka.connect.bigquery.SchemaManager; +import com.wepay.kafka.connect.bigquery.exception.BigQueryConnectException; +import com.wepay.kafka.connect.bigquery.utils.PartitionedTableId; +import org.apache.kafka.connect.sink.SinkRecord; + +import java.util.List; +import java.util.Map; + +public class UpsertDeleteBigQueryWriter extends AdaptiveBigQueryWriter { + + private final SchemaManager schemaManager; + private final boolean autoCreateTables; + private final Map intermediateToDestinationTables; + + /** + * @param bigQuery Used to send write requests to BigQuery. + * @param schemaManager Used to update BigQuery tables. + * @param retry How many retries to make in the event of a 500/503 error. + * @param retryWait How long to wait in between retries. + * @param autoCreateTables Whether destination tables should be automatically created + * @param intermediateToDestinationTables A mapping used to determine the destination table for + * given intermediate tables; used for create/update + * operations in order to propagate them to the destination + * table + */ + public UpsertDeleteBigQueryWriter(BigQuery bigQuery, + SchemaManager schemaManager, + int retry, + long retryWait, + boolean autoCreateTables, + Map intermediateToDestinationTables) { + // Hardcode autoCreateTables to true in the superclass so that intermediate tables will be + // automatically created + // The super class will handle all of the logic for writing to, creating, and updating + // intermediate tables; this class will handle logic for creating/updating the destination table + super(bigQuery, schemaManager.forIntermediateTables(), retry, retryWait, true); + this.schemaManager = schemaManager; + this.autoCreateTables = autoCreateTables; + this.intermediateToDestinationTables = intermediateToDestinationTables; + } + + @Override + protected void attemptSchemaUpdate(PartitionedTableId tableId, List records) { + // Update the intermediate table here... + super.attemptSchemaUpdate(tableId, records); + try { + // ... and update the destination table here + schemaManager.updateSchema(intermediateToDestinationTables.get(tableId.getBaseTableId()), records); + } catch (BigQueryException exception) { + throw new BigQueryConnectException( + "Failed to update destination table schema for: " + tableId.getBaseTableId(), exception); + } + } + + @Override + protected void attemptTableCreate(TableId tableId, List records) { + // Create the intermediate table here... + super.attemptTableCreate(tableId, records); + if (autoCreateTables) { + try { + // ... 
and create or update the destination table here, if it doesn't already exist and auto + // table creation is enabled + schemaManager.createOrUpdateTable(intermediateToDestinationTables.get(tableId), records); + } catch (BigQueryException exception) { + throw new BigQueryConnectException( + "Failed to create table " + tableId, exception); + } + } + } +} diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/BigQuerySinkConnectorTest.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/BigQuerySinkConnectorTest.java index 1da4bafe1..c1757bd95 100644 --- a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/BigQuerySinkConnectorTest.java +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/BigQuerySinkConnectorTest.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,33 +17,11 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertNotSame; - -import static org.mockito.Matchers.any; - -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.never; -import static org.mockito.Mockito.verify; -import static org.mockito.Mockito.when; - -import com.google.cloud.bigquery.BigQuery; -import com.google.cloud.bigquery.Table; -import com.google.cloud.bigquery.TableId; - -import com.wepay.kafka.connect.bigquery.api.KafkaSchemaRecordType; import com.wepay.kafka.connect.bigquery.api.SchemaRetriever; - -import com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig; - -import com.wepay.kafka.connect.bigquery.exception.BigQueryConnectException; -import com.wepay.kafka.connect.bigquery.exception.SinkConfigConnectException; -import org.apache.kafka.common.config.ConfigException; - +import com.wepay.kafka.connect.bigquery.config.BigQuerySinkTaskConfig; import org.apache.kafka.connect.data.Schema; - import org.apache.kafka.connect.sink.SinkRecord; import org.junit.BeforeClass; import org.junit.Test; @@ -52,8 +30,12 @@ import java.util.List; import java.util.Map; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNotSame; + public class BigQuerySinkConnectorTest { - private static SinkConnectorPropertiesFactory propertiesFactory; + private static SinkPropertiesFactory propertiesFactory; // Would just use Mockito, but can't provide the name of an anonymous class to the config file public static class MockSchemaRetriever implements SchemaRetriever { @@ -75,7 +57,7 @@ public Schema retrieveValueSchema(SinkRecord record){ @BeforeClass public static void initializePropertiesFactory() { - propertiesFactory = new SinkConnectorPropertiesFactory(); + propertiesFactory = new SinkPropertiesFactory(); } @Test @@ -87,13 +69,7 @@ public void testTaskClass() { public void testTaskConfigs() { Map properties = propertiesFactory.getProperties(); - Table fakeTable = mock(Table.class); - - BigQuery bigQuery = mock(BigQuery.class); - when(bigQuery.getTable(any(TableId.class))).thenReturn(fakeTable); - - SchemaManager schemaManager = mock(SchemaManager.class); - BigQuerySinkConnector testConnector = new 
BigQuerySinkConnector(bigQuery, schemaManager); + BigQuerySinkConnector testConnector = new BigQuerySinkConnector(); testConnector.start(properties); @@ -102,6 +78,7 @@ public void testTaskConfigs() { List> taskConfigs = testConnector.taskConfigs(i); assertEquals(i, taskConfigs.size()); for (int j = 0; j < i; j++) { + expectedProperties.put(BigQuerySinkTaskConfig.TASK_ID_CONFIG, Integer.toString(j)); assertEquals( "Connector properties should match task configs", expectedProperties, @@ -127,20 +104,7 @@ public void testTaskConfigs() { @Test public void testConfig() { - assertEquals(BigQuerySinkConfig.getConfig(), new BigQuerySinkConnector().config()); - } - - // Make sure that a config exception is properly translated into a SinkConfigConnectException - @Test(expected = SinkConfigConnectException.class) - public void testConfigException() { - try { - Map badProperties = propertiesFactory.getProperties(); - badProperties.remove(BigQuerySinkConfig.TOPICS_CONFIG); - BigQuerySinkConfig.validate(badProperties); - new BigQuerySinkConnector().start(badProperties); - } catch (ConfigException e) { - throw new SinkConfigConnectException(e); - } + assertNotNull(new BigQuerySinkConnector().config()); } @Test diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/BigQuerySinkTaskTest.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/BigQuerySinkTaskTest.java index 358a3c8ab..7c1faa609 100644 --- a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/BigQuerySinkTaskTest.java +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/BigQuerySinkTaskTest.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,10 +17,14 @@ * under the License. 
*/ +package com.wepay.kafka.connect.bigquery; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.CONNECTOR_RUNTIME_PROVIDER_CONFIG; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.CONNECTOR_RUNTIME_PROVIDER_DEFAULT; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.fail; +import static org.junit.Assert.assertThrows; +import static org.junit.Assert.assertTrue; import static org.mockito.Matchers.any; import static org.mockito.Matchers.anyObject; import static org.mockito.Mockito.mock; @@ -31,17 +35,23 @@ import com.google.cloud.bigquery.BigQuery; import com.google.cloud.bigquery.BigQueryError; import com.google.cloud.bigquery.BigQueryException; +import com.google.cloud.bigquery.Field; import com.google.cloud.bigquery.InsertAllRequest; import com.google.cloud.bigquery.InsertAllResponse; +import com.google.cloud.bigquery.StandardTableDefinition; import com.google.cloud.bigquery.Table; +import com.google.cloud.bigquery.LegacySQLTypeName; +import com.google.cloud.bigquery.QueryJobConfiguration; import com.google.cloud.bigquery.TableId; +import com.google.cloud.bigquery.TimePartitioning; +import com.google.cloud.storage.BlobInfo; import com.google.cloud.storage.Storage; import com.wepay.kafka.connect.bigquery.api.SchemaRetriever; import com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig; -import com.wepay.kafka.connect.bigquery.config.BigQuerySinkTaskConfig; import com.wepay.kafka.connect.bigquery.exception.BigQueryConnectException; -import com.wepay.kafka.connect.bigquery.exception.SinkConfigConnectException; +import com.wepay.kafka.connect.bigquery.write.batch.MergeBatches; +import java.net.SocketTimeoutException; import org.apache.kafka.common.config.ConfigException; import org.apache.kafka.common.record.TimestampType; @@ -49,27 +59,49 @@ import org.apache.kafka.connect.data.SchemaBuilder; import org.apache.kafka.connect.data.Struct; import org.apache.kafka.connect.errors.ConnectException; +import org.apache.kafka.connect.errors.RetriableException; import org.apache.kafka.connect.sink.SinkRecord; import org.apache.kafka.connect.sink.SinkTaskContext; +import org.junit.After; +import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; import org.mockito.ArgumentCaptor; -import org.mockito.Captor; +import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.Map; +import java.util.concurrent.CountDownLatch; import java.util.concurrent.RejectedExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; +import java.util.stream.IntStream; public class BigQuerySinkTaskTest { private static SinkTaskPropertiesFactory propertiesFactory; + private static AtomicLong spoofedRecordOffset = new AtomicLong(); + @BeforeClass public static void initializePropertiesFactory() { propertiesFactory = new SinkTaskPropertiesFactory(); } + @Before + public void setUp() { + MergeBatches.setStreamingBufferAvailabilityWait(0); + spoofedRecordOffset.set(0); + } + + @After + public void cleanUp() { + MergeBatches.resetStreamingBufferAvailabilityWait(); + } + @Test public void testSimplePut() { final String topic = "test-topic"; @@ -92,8 +124,9 @@ public void testSimplePut() { SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager schemaManager = mock(SchemaManager.class); + Map cache = new HashMap<>(); - BigQuerySinkTask testTask = new 
BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.initialize(sinkTaskContext); testTask.start(properties); @@ -102,6 +135,44 @@ public void testSimplePut() { verify(bigQuery, times(1)).insertAll(any(InsertAllRequest.class)); } + @Test + public void testPutForGCSToBQ() { + final String topic = "test-topic"; + final int repeats = 20; + Map properties = propertiesFactory.getProperties(); + properties.put(BigQuerySinkConfig.TOPICS_CONFIG, topic); + properties.put(BigQuerySinkConfig.DEFAULT_DATASET_CONFIG, "scratch"); + properties.put(BigQuerySinkConfig.ENABLE_BATCH_CONFIG, "test-topic"); + + BigQuery bigQuery = mock(BigQuery.class); + Table mockTable = mock(Table.class); + when(bigQuery.getTable(any())).thenReturn(mockTable); + + Storage storage = mock(Storage.class); + + SinkTaskContext sinkTaskContext = mock(SinkTaskContext.class); + InsertAllResponse insertAllResponse = mock(InsertAllResponse.class); + + when(bigQuery.insertAll(anyObject())).thenReturn(insertAllResponse); + when(insertAllResponse.hasErrors()).thenReturn(false); + + SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); + SchemaManager schemaManager = mock(SchemaManager.class); + Map cache = new HashMap<>(); + + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); + testTask.initialize(sinkTaskContext); + testTask.start(properties); + + IntStream.range(0,repeats).forEach(i -> testTask.put(Collections.singletonList(spoofSinkRecord(topic)))); + + ArgumentCaptor blobInfo = ArgumentCaptor.forClass(BlobInfo.class); + testTask.flush(Collections.emptyMap()); + + verify(storage, times(repeats)).create(blobInfo.capture(), (byte[])anyObject()); + assertEquals(repeats, blobInfo.getAllValues().stream().map(info -> info.getBlobId().getName()).collect(Collectors.toSet()).size()); + + } @Test public void testSimplePutWhenSchemaRetrieverIsNotNull() { final String topic = "test-topic"; @@ -123,12 +194,15 @@ public void testSimplePutWhenSchemaRetrieverIsNotNull() { SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager schemaManager = mock(SchemaManager.class); + Map cache = new HashMap<>(); - BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.initialize(sinkTaskContext); testTask.start(properties); - testTask.put(Collections.singletonList(spoofSinkRecord(topic))); + SinkRecord spoofedRecord = + spoofSinkRecord(topic, "k", "key", "v", "value", TimestampType.NO_TIMESTAMP_TYPE, null); + testTask.put(Collections.singletonList(spoofedRecord)); testTask.flush(Collections.emptyMap()); verify(bigQuery, times(1)).insertAll(any(InsertAllRequest.class)); } @@ -141,8 +215,9 @@ public void testEmptyPut() { SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager schemaManager = mock(SchemaManager.class); + Map cache = new HashMap<>(); - BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.start(properties); testTask.put(Collections.emptyList()); @@ -163,8 +238,9 @@ public void testEmptyRecordPut() { SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager 
schemaManager = mock(SchemaManager.class); + Map cache = new HashMap<>(); - BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.start(properties); SinkRecord emptyRecord = spoofSinkRecord(topic, simpleSchema, null); @@ -172,8 +248,6 @@ public void testEmptyRecordPut() { testTask.put(Collections.singletonList(emptyRecord)); } - @Captor ArgumentCaptor captor; - @Test public void testPutWhenPartitioningOnMessageTime() { final String topic = "test-topic"; @@ -181,7 +255,7 @@ public void testPutWhenPartitioningOnMessageTime() { Map properties = propertiesFactory.getProperties(); properties.put(BigQuerySinkConfig.TOPICS_CONFIG, topic); properties.put(BigQuerySinkConfig.DEFAULT_DATASET_CONFIG, "scratch"); - properties.put(BigQuerySinkTaskConfig.BIGQUERY_MESSAGE_TIME_PARTITIONING_CONFIG, "true"); + properties.put(BigQuerySinkConfig.BIGQUERY_MESSAGE_TIME_PARTITIONING_CONFIG, "true"); BigQuery bigQuery = mock(BigQuery.class); Table mockTable = mock(Table.class); @@ -196,8 +270,9 @@ public void testPutWhenPartitioningOnMessageTime() { SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager schemaManager = mock(SchemaManager.class); + Map cache = new HashMap<>(); - BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.initialize(sinkTaskContext); testTask.start(properties); @@ -217,8 +292,8 @@ public void testPutWhenPartitioningIsSetToTrue() { Map properties = propertiesFactory.getProperties(); properties.put(BigQuerySinkConfig.TOPICS_CONFIG, topic); properties.put(BigQuerySinkConfig.DEFAULT_DATASET_CONFIG, "scratch"); - properties.put(BigQuerySinkTaskConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG, "true"); - properties.put(BigQuerySinkTaskConfig.BIGQUERY_MESSAGE_TIME_PARTITIONING_CONFIG, "true"); + properties.put(BigQuerySinkConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG, "true"); + properties.put(BigQuerySinkConfig.BIGQUERY_MESSAGE_TIME_PARTITIONING_CONFIG, "true"); BigQuery bigQuery = mock(BigQuery.class); Table mockTable = mock(Table.class); @@ -233,8 +308,9 @@ public void testPutWhenPartitioningIsSetToTrue() { SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager schemaManager = mock(SchemaManager.class); + Map cache = new HashMap<>(); - BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.initialize(sinkTaskContext); testTask.start(properties); @@ -254,7 +330,7 @@ public void testPutWhenPartitioningIsSetToFalse() { Map properties = propertiesFactory.getProperties(); properties.put(BigQuerySinkConfig.TOPICS_CONFIG, topic); properties.put(BigQuerySinkConfig.DEFAULT_DATASET_CONFIG, "scratch"); - properties.put(BigQuerySinkTaskConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG, "false"); + properties.put(BigQuerySinkConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG, "false"); BigQuery bigQuery = mock(BigQuery.class); Table mockTable = mock(Table.class); @@ -269,8 +345,9 @@ public void testPutWhenPartitioningIsSetToFalse() { SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager schemaManager = mock(SchemaManager.class); + Map cache = new HashMap<>(); - 
BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.initialize(sinkTaskContext); testTask.start(properties); testTask.put(Collections.singletonList(spoofSinkRecord(topic, "value", "message text", @@ -290,7 +367,7 @@ public void testPutWhenPartitioningOnMessageTimeWhenNoTimestampType() { Map properties = propertiesFactory.getProperties(); properties.put(BigQuerySinkConfig.TOPICS_CONFIG, topic); properties.put(BigQuerySinkConfig.DEFAULT_DATASET_CONFIG, "scratch"); - properties.put(BigQuerySinkTaskConfig.BIGQUERY_MESSAGE_TIME_PARTITIONING_CONFIG, "true"); + properties.put(BigQuerySinkConfig.BIGQUERY_MESSAGE_TIME_PARTITIONING_CONFIG, "true"); BigQuery bigQuery = mock(BigQuery.class); Table mockTable = mock(Table.class); @@ -305,8 +382,9 @@ public void testPutWhenPartitioningOnMessageTimeWhenNoTimestampType() { SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager schemaManager = mock(SchemaManager.class); + Map cache = new HashMap<>(); - BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.initialize(sinkTaskContext); testTask.start(properties); @@ -314,41 +392,115 @@ public void testPutWhenPartitioningOnMessageTimeWhenNoTimestampType() { TimestampType.NO_TIMESTAMP_TYPE, null))); } - // It's important that the buffer be completely wiped after a call to flush, since any execption - // thrown during flush causes Kafka Connect to not commit the offsets for any records sent to the - // task since the last flush @Test - public void testBufferClearOnFlushError() { - final String dataset = "scratch"; - final String topic = "test_topic"; + public void testPutWithUpsertDelete() throws Exception { + final String topic = "test-topic"; + final String key = "kafkaKey"; + final String value = "recordValue"; Map properties = propertiesFactory.getProperties(); properties.put(BigQuerySinkConfig.TOPICS_CONFIG, topic); - properties.put(BigQuerySinkConfig.DEFAULT_DATASET_CONFIG, dataset); + properties.put(BigQuerySinkConfig.UPSERT_ENABLED_CONFIG, "true"); + properties.put(BigQuerySinkConfig.DELETE_ENABLED_CONFIG, "true"); + properties.put(BigQuerySinkConfig.MERGE_INTERVAL_MS_CONFIG, "-1"); + properties.put(BigQuerySinkConfig.MERGE_RECORDS_THRESHOLD_CONFIG, "2"); + properties.put(BigQuerySinkConfig.KAFKA_KEY_FIELD_NAME_CONFIG, key); + + BigQuery bigQuery = mock(BigQuery.class); + Storage storage = mock(Storage.class); + SinkTaskContext sinkTaskContext = mock(SinkTaskContext.class); + + InsertAllResponse insertAllResponse = mock(InsertAllResponse.class); + when(bigQuery.insertAll(anyObject())).thenReturn(insertAllResponse); + when(insertAllResponse.hasErrors()).thenReturn(false); + + SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); + SchemaManager schemaManager = mock(SchemaManager.class); + Map cache = new HashMap<>(); + Field keyField = Field.of(key, LegacySQLTypeName.STRING); + Field valueField = Field.of(value, LegacySQLTypeName.STRING); + com.google.cloud.bigquery.Schema intermediateSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder(MergeQueries.INTERMEDIATE_TABLE_BATCH_NUMBER_FIELD, LegacySQLTypeName.INTEGER) + .setMode(Field.Mode.REQUIRED) + .build(), + Field.newBuilder(MergeQueries.INTERMEDIATE_TABLE_PARTITION_TIME_FIELD_NAME, 
LegacySQLTypeName.TIMESTAMP) + .setMode(Field.Mode.NULLABLE) + .build(), + Field.newBuilder(MergeQueries.INTERMEDIATE_TABLE_KEY_FIELD_NAME, LegacySQLTypeName.RECORD, keyField) + .setMode(Field.Mode.REQUIRED) + .build(), + Field.newBuilder(MergeQueries.INTERMEDIATE_TABLE_VALUE_FIELD_NAME, LegacySQLTypeName.RECORD, valueField) + .build() + ); + when(schemaManager.cachedSchema(any())).thenReturn(intermediateSchema); + + CountDownLatch executedMerges = new CountDownLatch(2); + CountDownLatch executedBatchClears = new CountDownLatch(2); + + when(bigQuery.query(any(QueryJobConfiguration.class))).then(invocation -> { + String query = invocation.getArgument(0, QueryJobConfiguration.class).getQuery(); + if (query.startsWith("MERGE")) { + executedMerges.countDown(); + } else if (query.startsWith("DELETE")) { + executedBatchClears.countDown(); + } + return null; + }); + + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); + testTask.initialize(sinkTaskContext); + testTask.start(properties); + + // Insert a few regular records and one tombstone record + testTask.put(Arrays.asList( + spoofSinkRecord(topic, key, "4761", "value", "message text", TimestampType.NO_TIMESTAMP_TYPE, null), + spoofSinkRecord(topic, key, "489", "value", "other message text", TimestampType.NO_TIMESTAMP_TYPE, null), + spoofSinkRecord(topic, key, "28980", "value", "more message text", TimestampType.NO_TIMESTAMP_TYPE, null), + spoofSinkRecord(topic, key, "4761", null, null, TimestampType.NO_TIMESTAMP_TYPE, null) + )); + + assertTrue("Merge queries should be executed", executedMerges.await(5, TimeUnit.SECONDS)); + assertTrue("Batch clears should be executed", executedBatchClears.await(1, TimeUnit.SECONDS)); + } + + // Throw an exception on the first put, and assert the Exception will be exposed in subsequent + // put call. 
+ @Test(expected = BigQueryConnectException.class, timeout = 30000L) + public void testSimplePutException() throws InterruptedException { + final String topic = "test-topic"; + Map properties = propertiesFactory.getProperties(); + properties.put(BigQuerySinkConfig.TOPICS_CONFIG, topic); + properties.put(BigQuerySinkConfig.DEFAULT_DATASET_CONFIG, "scratch"); BigQuery bigQuery = mock(BigQuery.class); Table mockTable = mock(Table.class); when(bigQuery.getTable(any())).thenReturn(mockTable); Storage storage = mock(Storage.class); - when(bigQuery.insertAll(any(InsertAllRequest.class))) - .thenThrow(new RuntimeException("This is a test")); + String error = "Cannot add required fields to an existing schema."; + SinkTaskContext sinkTaskContext = mock(SinkTaskContext.class); + when(bigQuery.insertAll(any())) + .thenThrow( + new BigQueryException(400, error, new BigQueryError("invalid", "global", error))); SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager schemaManager = mock(SchemaManager.class); + Map cache = new HashMap<>(); - SinkTaskContext sinkTaskContext = mock(SinkTaskContext.class); - BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + BigQuerySinkTask testTask = + new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.initialize(sinkTaskContext); testTask.start(properties); + testTask.put(Collections.singletonList(spoofSinkRecord(topic))); try { - testTask.put(Collections.singletonList(spoofSinkRecord(topic))); - testTask.flush(Collections.emptyMap()); - fail("An exception should have been thrown by now"); - } catch (BigQueryConnectException err) { - testTask.flush(Collections.emptyMap()); - verify(bigQuery, times(1)).insertAll(any(InsertAllRequest.class)); + while (true) { + Thread.sleep(100); + testTask.put(Collections.emptyList()); + } + } catch (Exception e) { + assertTrue(e.getCause().getCause().getMessage().contains(error)); + throw e; } } @@ -360,12 +512,76 @@ public void testEmptyFlush() { SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager schemaManager = mock(SchemaManager.class); + Map cache = new HashMap<>(); + + SinkTaskContext sinkTaskContext = mock(SinkTaskContext.class); + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); + testTask.initialize(sinkTaskContext); + testTask.start(properties); + + testTask.flush(Collections.emptyMap()); + } + + @Test + public void testFlushAfterStop() { + Map properties = propertiesFactory.getProperties(); + Storage storage = mock(Storage.class); + + BigQuery bigQuery = mock(BigQuery.class); + when(bigQuery.insertAll(any())) + .thenThrow( + new BigQueryException(400, "Oops", new BigQueryError("invalid", "global", "oops"))); + + SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); + SchemaManager schemaManager = mock(SchemaManager.class); + Map cache = new HashMap<>(); SinkTaskContext sinkTaskContext = mock(SinkTaskContext.class); - BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.initialize(sinkTaskContext); testTask.start(properties); + testTask.put(Collections.singletonList(spoofSinkRecord("t"))); + assertThrows( + "first call to flush should fail", + Exception.class, + () -> testTask.flush(Collections.emptyMap())); + assertThrows( + "second call to flush should fail", + 
Exception.class, + () -> testTask.flush(Collections.emptyMap())); + testTask.stop(); + assertThrows( + "third call to flush (after task stop) should fail", + Exception.class, + () -> testTask.flush(Collections.emptyMap())); + } + + @Test(expected = RetriableException.class) + public void testBigQueryReadTimeout() { + final String topic = "test_topic"; + final String dataset = "scratch"; + + Map properties = propertiesFactory.getProperties(); + properties.put(BigQuerySinkConfig.BIGQUERY_RETRY_CONFIG, "3"); + properties.put(BigQuerySinkConfig.BIGQUERY_RETRY_WAIT_CONFIG, "2000"); + properties.put(BigQuerySinkConfig.TOPICS_CONFIG, topic); + properties.put(BigQuerySinkConfig.DEFAULT_DATASET_CONFIG, dataset); + + BigQuery bigQuery = mock(BigQuery.class); + when(bigQuery.getTable(any())).thenThrow(new BigQueryException(new SocketTimeoutException("mock timeout"))); + + Storage storage = mock(Storage.class); + SinkTaskContext sinkTaskContext = mock(SinkTaskContext.class); + + SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); + SchemaManager schemaManager = mock(SchemaManager.class); + Map cache = new HashMap<>(); + + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); + testTask.initialize(sinkTaskContext); + testTask.start(properties); + testTask.put(Collections.singletonList(spoofSinkRecord(topic))); testTask.flush(Collections.emptyMap()); } @@ -375,8 +591,8 @@ public void testBigQuery5XXRetry() { final String dataset = "scratch"; Map properties = propertiesFactory.getProperties(); - properties.put(BigQuerySinkTaskConfig.BIGQUERY_RETRY_CONFIG, "3"); - properties.put(BigQuerySinkTaskConfig.BIGQUERY_RETRY_WAIT_CONFIG, "2000"); + properties.put(BigQuerySinkConfig.BIGQUERY_RETRY_CONFIG, "3"); + properties.put(BigQuerySinkConfig.BIGQUERY_RETRY_WAIT_CONFIG, "2000"); properties.put(BigQuerySinkConfig.TOPICS_CONFIG, topic); properties.put(BigQuerySinkConfig.DEFAULT_DATASET_CONFIG, dataset); @@ -398,8 +614,9 @@ public void testBigQuery5XXRetry() { SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager schemaManager = mock(SchemaManager.class); + Map cache = new HashMap<>(); - BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.initialize(sinkTaskContext); testTask.start(properties); testTask.put(Collections.singletonList(spoofSinkRecord(topic))); @@ -414,8 +631,8 @@ public void testBigQuery403Retry() { final String dataset = "scratch"; Map properties = propertiesFactory.getProperties(); - properties.put(BigQuerySinkTaskConfig.BIGQUERY_RETRY_CONFIG, "2"); - properties.put(BigQuerySinkTaskConfig.BIGQUERY_RETRY_WAIT_CONFIG, "2000"); + properties.put(BigQuerySinkConfig.BIGQUERY_RETRY_CONFIG, "2"); + properties.put(BigQuerySinkConfig.BIGQUERY_RETRY_WAIT_CONFIG, "2000"); properties.put(BigQuerySinkConfig.TOPICS_CONFIG, topic); properties.put(BigQuerySinkConfig.DEFAULT_DATASET_CONFIG, dataset); @@ -438,8 +655,9 @@ public void testBigQuery403Retry() { SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager schemaManager = mock(SchemaManager.class); + Map cache = new HashMap<>(); - BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.initialize(sinkTaskContext); 
testTask.start(properties); testTask.put(Collections.singletonList(spoofSinkRecord(topic))); @@ -454,8 +672,8 @@ public void testBigQueryRetryExceeded() { final String dataset = "scratch"; Map properties = propertiesFactory.getProperties(); - properties.put(BigQuerySinkTaskConfig.BIGQUERY_RETRY_CONFIG, "1"); - properties.put(BigQuerySinkTaskConfig.BIGQUERY_RETRY_WAIT_CONFIG, "2000"); + properties.put(BigQuerySinkConfig.BIGQUERY_RETRY_CONFIG, "1"); + properties.put(BigQuerySinkConfig.BIGQUERY_RETRY_WAIT_CONFIG, "2000"); properties.put(BigQuerySinkConfig.TOPICS_CONFIG, topic); properties.put(BigQuerySinkConfig.DEFAULT_DATASET_CONFIG, dataset); @@ -475,8 +693,9 @@ public void testBigQueryRetryExceeded() { SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager schemaManager = mock(SchemaManager.class); + Map cache = new HashMap<>(); - BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.initialize(sinkTaskContext); testTask.start(properties); testTask.put(Collections.singletonList(spoofSinkRecord(topic))); @@ -505,9 +724,10 @@ public void testInterruptedException() { SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager schemaManager = mock(SchemaManager.class); + Map cache = new HashMap<>(); SinkTaskContext sinkTaskContext = mock(SinkTaskContext.class); - BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.initialize(sinkTaskContext); testTask.start(properties); @@ -519,23 +739,33 @@ public void testInterruptedException() { testTask.flush(Collections.emptyMap()); } - // Make sure that a ConfigException is properly translated into a SinkConfigConnectException - @Test(expected = SinkConfigConnectException.class) - public void testConfigException() { - try { - Map badProperties = propertiesFactory.getProperties(); - badProperties.remove(BigQuerySinkConfig.TOPICS_CONFIG); - BigQuerySinkConfig.validate(badProperties); - - SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); - SchemaManager schemaManager = mock(SchemaManager.class); - - BigQuerySinkTask testTask = - new BigQuerySinkTask(mock(BigQuery.class), schemaRetriever, mock(Storage.class), schemaManager); - testTask.start(badProperties); - } catch (ConfigException e) { - throw new SinkConfigConnectException(e); - } + @Test(expected = ConnectException.class) + public void testTimePartitioningIncompatibleWithDecoratorSyntax() { + final String topic = "t1"; + final String dataset = "d"; + + Map properties = propertiesFactory.getProperties(); + properties.put(BigQuerySinkConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG, "true"); + properties.put(BigQuerySinkConfig.BIGQUERY_MESSAGE_TIME_PARTITIONING_CONFIG, "true"); + properties.put(BigQuerySinkConfig.TOPICS_CONFIG, topic); + properties.put(BigQuerySinkConfig.DEFAULT_DATASET_CONFIG, dataset); + + StandardTableDefinition mockTableDefinition = mock(StandardTableDefinition.class); + when(mockTableDefinition.getTimePartitioning()).thenReturn(TimePartitioning.of(TimePartitioning.Type.HOUR)); + Table table = mock(Table.class); + when(table.getDefinition()).thenReturn(mockTableDefinition); + Map tableCache = new HashMap<>(); + tableCache.put(TableId.of(dataset, topic), table); + + Storage storage = mock(Storage.class); + 
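+    // (Decorator-based writes, e.g. to "table$20200101", appear to target day-based partitions
+    // only, so enabling them together with message-time partitioning against the HOUR-partitioned
+    // table mocked above is presumably what makes the put() below fail with the expected
+    // ConnectException.)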
BigQuery bigQuery = mock(BigQuery.class); + + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, null, storage, null, tableCache); + SinkTaskContext sinkTaskContext = mock(SinkTaskContext.class); + testTask.initialize(sinkTaskContext); + testTask.start(properties); + + testTask.put(Collections.singleton(spoofSinkRecord(topic, "f1", "v1", TimestampType.CREATE_TIME, 1L))); } @Test @@ -566,9 +796,10 @@ public void testStop() { SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager schemaManager = mock(SchemaManager.class); + Map cache = new HashMap<>(); Storage storage = mock(Storage.class); - BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.initialize(sinkTaskContext); testTask.start(properties); testTask.put(Collections.singletonList(spoofSinkRecord(topic))); @@ -581,39 +812,72 @@ public void testStop() { testTask.put(Collections.singletonList(spoofSinkRecord(topic))); } + @Test + public void testKafkaProviderConfigInvalidValue() { + Map configProperties = propertiesFactory.getProperties(); + String testKafkaProvider = "testProvider"; + configProperties.put(CONNECTOR_RUNTIME_PROVIDER_CONFIG, testKafkaProvider); + BigQuerySinkConfig config = new BigQuerySinkConfig(configProperties); + + GcpClientBuilder clientBuilder = new GcpClientBuilder.BigQueryBuilder().withConfig(config); + assertTrue(clientBuilder.getHeaderProvider().getHeaders().get("user-agent").contains(CONNECTOR_RUNTIME_PROVIDER_DEFAULT)); + + GcpClientBuilder storageBuilder = new GcpClientBuilder.GcsBuilder().withConfig(config); + assertTrue(storageBuilder.getHeaderProvider().getHeaders().get("user-agent").contains(CONNECTOR_RUNTIME_PROVIDER_DEFAULT)); + } + /** - * Utility method for spoofing InsertAllRequests that should be sent to a BigQuery object. - * @param table The table to write to. - * @param rows The rows to write. - * @return The spoofed InsertAllRequest. + * Utility method for spoofing SinkRecords that should be passed to SinkTask.put() + * @param topic The topic of the record. + * @param keyField The field name for the record key; may be null. + * @param key The content of the record key; may be null. + * @param valueField The field name for the record value; may be null + * @param value The content of the record value; may be null + * @param timestampType The type of timestamp embedded in the message + * @param timestamp The timestamp in milliseconds + * @return The spoofed SinkRecord. */ - public static InsertAllRequest buildExpectedInsertAllRequest( - TableId table, - InsertAllRequest.RowToInsert... 
rows) { - return InsertAllRequest.newBuilder(table, rows) - .setIgnoreUnknownValues(false) - .setSkipInvalidRows(false) - .build(); + public static SinkRecord spoofSinkRecord(String topic, String keyField, String key, + String valueField, String value, + TimestampType timestampType, Long timestamp) { + Schema basicKeySchema = null; + Struct basicKey = null; + if (keyField != null) { + basicKeySchema = SchemaBuilder + .struct() + .field(keyField, Schema.STRING_SCHEMA) + .build(); + basicKey = new Struct(basicKeySchema); + basicKey.put(keyField, key); + } + + Schema basicValueSchema = null; + Struct basicValue = null; + if (valueField != null) { + basicValueSchema = SchemaBuilder + .struct() + .field(valueField, Schema.STRING_SCHEMA) + .build(); + basicValue = new Struct(basicValueSchema); + basicValue.put(valueField, value); + } + + return new SinkRecord(topic, 0, basicKeySchema, basicKey, + basicValueSchema, basicValue, spoofedRecordOffset.getAndIncrement(), timestamp, timestampType); } /** * Utility method for spoofing SinkRecords that should be passed to SinkTask.put() * @param topic The topic of the record. - * @param value The content of the record. + * @param field The field name for the record value. + * @param value The content of the record value. * @param timestampType The type of timestamp embedded in the message * @param timestamp The timestamp in milliseconds * @return The spoofed SinkRecord. */ public static SinkRecord spoofSinkRecord(String topic, String field, String value, TimestampType timestampType, Long timestamp) { - Schema basicRowSchema = SchemaBuilder - .struct() - .field(field, Schema.STRING_SCHEMA) - .build(); - Struct basicRowValue = new Struct(basicRowSchema); - basicRowValue.put(field, value); - return new SinkRecord(topic, 0, null, null, - basicRowSchema, basicRowValue, 0, timestamp, timestampType); + return spoofSinkRecord(topic, null, null, field, value, timestampType, timestamp); } /** diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/GCSToBQLoadRunnableTest.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/GCSToBQLoadRunnableTest.java index 38c2ef30a..af4fb5f48 100644 --- a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/GCSToBQLoadRunnableTest.java +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/GCSToBQLoadRunnableTest.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery; - /* - * Copyright 2018 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNull; diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/MergeQueriesTest.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/MergeQueriesTest.java new file mode 100644 index 000000000..293b7736a --- /dev/null +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/MergeQueriesTest.java @@ -0,0 +1,315 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package com.wepay.kafka.connect.bigquery; + +import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.bigquery.Field; +import com.google.cloud.bigquery.LegacySQLTypeName; +import com.google.cloud.bigquery.Schema; +import com.google.cloud.bigquery.TableId; +import com.wepay.kafka.connect.bigquery.write.batch.KCBQThreadPoolExecutor; +import com.wepay.kafka.connect.bigquery.write.batch.MergeBatches; +import org.apache.kafka.connect.sink.SinkTaskContext; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.Mock; +import org.mockito.junit.MockitoJUnitRunner; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.junit.Assert.assertEquals; +import static org.mockito.Mockito.when; + +@RunWith(MockitoJUnitRunner.class) +public class MergeQueriesTest { + + private static final String KEY = "kafkaKey"; + + private static final int BATCH_NUMBER = 42; + private static final TableId DESTINATION_TABLE = TableId.of("ds1", "t"); + private static final TableId INTERMEDIATE_TABLE = TableId.of("ds1", "t_tmp_6_uuid_epoch"); + private static final Schema INTERMEDIATE_TABLE_SCHEMA = constructIntermediateTable(); + + @Mock private MergeBatches mergeBatches; + @Mock private KCBQThreadPoolExecutor executor; + @Mock private BigQuery bigQuery; + @Mock private SchemaManager schemaManager; + @Mock private SinkTaskContext context; + + @Before + public void setUp() { + when(schemaManager.cachedSchema(INTERMEDIATE_TABLE)).thenReturn(INTERMEDIATE_TABLE_SCHEMA); + } + + private MergeQueries mergeQueries(boolean insertPartitionTime, boolean upsert, boolean delete) { + return new MergeQueries( + KEY, insertPartitionTime, upsert, delete, mergeBatches, executor, bigQuery, schemaManager, context + ); + } + + private static Schema constructIntermediateTable() { + List fields = new ArrayList<>(); + + List valueFields = Arrays.asList( + Field.of("f1", LegacySQLTypeName.STRING), + Field.of("f2", LegacySQLTypeName.RECORD, + Field.of("nested_f1", LegacySQLTypeName.INTEGER) + ), + Field.of("f3", LegacySQLTypeName.BOOLEAN), + Field.of("f4", LegacySQLTypeName.BYTES) + ); + Field wrappedValueField = Field + .newBuilder(MergeQueries.INTERMEDIATE_TABLE_VALUE_FIELD_NAME, LegacySQLTypeName.RECORD, valueFields.toArray(new Field[0])) + .setMode(Field.Mode.NULLABLE) + .build(); + fields.add(wrappedValueField); + + List keyFields = Arrays.asList( + Field.of("k1", LegacySQLTypeName.STRING), + Field.of("k2", LegacySQLTypeName.RECORD, + Field.of("nested_k1", LegacySQLTypeName.RECORD, + Field.of("doubly_nested_k", LegacySQLTypeName.BOOLEAN) + ), + Field.of("nested_k2", LegacySQLTypeName.INTEGER) + ) + ); + Field kafkaKeyField = Field.newBuilder(MergeQueries.INTERMEDIATE_TABLE_KEY_FIELD_NAME, LegacySQLTypeName.RECORD, keyFields.toArray(new Field[0])) + .setMode(Field.Mode.REQUIRED) + .build(); + fields.add(kafkaKeyField); + + Field partitionTimeField = Field + 
.newBuilder(MergeQueries.INTERMEDIATE_TABLE_PARTITION_TIME_FIELD_NAME, LegacySQLTypeName.TIMESTAMP) + .setMode(Field.Mode.NULLABLE) + .build(); + fields.add(partitionTimeField); + + Field batchNumberField = Field + .newBuilder(MergeQueries.INTERMEDIATE_TABLE_BATCH_NUMBER_FIELD, LegacySQLTypeName.INTEGER) + .setMode(Field.Mode.REQUIRED) + .build(); + fields.add(batchNumberField); + + return Schema.of(fields); + } + + @Test + public void testUpsertQueryWithPartitionTime() { + String expectedQuery = + "MERGE " + table(DESTINATION_TABLE) + " " + + "USING (SELECT * FROM (SELECT ARRAY_AGG(x ORDER BY i DESC LIMIT 1)[OFFSET(0)] src " + + "FROM " + table(INTERMEDIATE_TABLE) + " x " + + "WHERE batchNumber=" + BATCH_NUMBER + " " + + "GROUP BY key.k1, key.k2.nested_k1.doubly_nested_k, key.k2.nested_k2)) " + + "ON `" + DESTINATION_TABLE.getTable() + "`." + KEY + "=src.key " + + "WHEN MATCHED " + + "THEN UPDATE SET `f1`=src.value.f1, `f2`=src.value.f2, `f3`=src.value.f3, `f4`=src.value.f4 " + + "WHEN NOT MATCHED " + + "THEN INSERT (`" + + KEY + "`, " + + "_PARTITIONTIME, " + + "`f1`, `f2`, `f3`, `f4`) " + + "VALUES (" + + "src.key, " + + "CAST(CAST(DATE(src.partitionTime) AS DATE) AS TIMESTAMP), " + + "src.value.f1, src.value.f2, src.value.f3, src.value.f4" + + ");"; + String actualQuery = mergeQueries(true, true, false) + .mergeFlushQuery(INTERMEDIATE_TABLE, DESTINATION_TABLE, BATCH_NUMBER); + assertEquals(expectedQuery, actualQuery); + } + + @Test + public void testUpsertQueryWithoutPartitionTime() { + String expectedQuery = + "MERGE " + table(DESTINATION_TABLE) + " " + + "USING (SELECT * FROM (SELECT ARRAY_AGG(x ORDER BY i DESC LIMIT 1)[OFFSET(0)] src " + + "FROM " + table(INTERMEDIATE_TABLE) + " x " + + "WHERE batchNumber=" + BATCH_NUMBER + " " + + "GROUP BY key.k1, key.k2.nested_k1.doubly_nested_k, key.k2.nested_k2)) " + + "ON `" + DESTINATION_TABLE.getTable() + "`." + KEY + "=src.key " + + "WHEN MATCHED " + + "THEN UPDATE SET `f1`=src.value.f1, `f2`=src.value.f2, `f3`=src.value.f3, `f4`=src.value.f4 " + + "WHEN NOT MATCHED " + + "THEN INSERT (`" + + KEY + "`, " + + "`f1`, `f2`, `f3`, `f4`) " + + "VALUES (" + + "src.key, " + + "src.value.f1, src.value.f2, src.value.f3, src.value.f4" + + ");"; + String actualQuery = mergeQueries(false, true, false) + .mergeFlushQuery(INTERMEDIATE_TABLE, DESTINATION_TABLE, BATCH_NUMBER); + assertEquals(expectedQuery, actualQuery); + } + + @Test + public void testDeleteQueryWithPartitionTime() { + String expectedQuery = + "MERGE " + table(DESTINATION_TABLE) + " " + + "USING (" + + "SELECT batch.key AS key, partitionTime, value " + + "FROM (" + + "SELECT src.i, src.key FROM (" + + "SELECT ARRAY_AGG(" + + "x ORDER BY i DESC LIMIT 1" + + ")[OFFSET(0)] src " + + "FROM (" + + "SELECT * FROM " + table(INTERMEDIATE_TABLE) + " " + + "WHERE batchNumber=" + BATCH_NUMBER + + ") x " + + "WHERE x.value IS NULL " + + "GROUP BY key.k1, key.k2.nested_k1.doubly_nested_k, key.k2.nested_k2)) AS deletes " + + "RIGHT JOIN (" + + "SELECT * FROM " + table(INTERMEDIATE_TABLE) + " " + + "WHERE batchNumber=" + BATCH_NUMBER + + ") AS batch " + + "USING (key) " + + "WHERE deletes.i IS NULL OR batch.i >= deletes.i " + + "ORDER BY batch.i ASC) AS src " + + "ON `" + DESTINATION_TABLE.getTable() + "`." 
+ KEY + "=src.key AND src.value IS NULL " + + "WHEN MATCHED " + + "THEN DELETE " + + "WHEN NOT MATCHED AND src.value IS NOT NULL " + + "THEN INSERT (`" + + KEY + "`, " + + "_PARTITIONTIME, " + + "`f1`, `f2`, `f3`, `f4`) " + + "VALUES (" + + "src.key, " + + "CAST(CAST(DATE(src.partitionTime) AS DATE) AS TIMESTAMP), " + + "src.value.f1, src.value.f2, src.value.f3, src.value.f4" + + ");"; + String actualQuery = mergeQueries(true, false, true) + .mergeFlushQuery(INTERMEDIATE_TABLE, DESTINATION_TABLE, BATCH_NUMBER); + assertEquals(expectedQuery, actualQuery); + } + + @Test + public void testDeleteQueryWithoutPartitionTime() { + String expectedQuery = + "MERGE " + table(DESTINATION_TABLE) + " " + + "USING (" + + "SELECT batch.key AS key, value " + + "FROM (" + + "SELECT src.i, src.key FROM (" + + "SELECT ARRAY_AGG(" + + "x ORDER BY i DESC LIMIT 1" + + ")[OFFSET(0)] src " + + "FROM (" + + "SELECT * FROM " + table(INTERMEDIATE_TABLE) + " " + + "WHERE batchNumber=" + BATCH_NUMBER + + ") x " + + "WHERE x.value IS NULL " + + "GROUP BY key.k1, key.k2.nested_k1.doubly_nested_k, key.k2.nested_k2)) AS deletes " + + "RIGHT JOIN (" + + "SELECT * FROM " + table(INTERMEDIATE_TABLE) + " " + + "WHERE batchNumber=" + BATCH_NUMBER + + ") AS batch " + + "USING (key) " + + "WHERE deletes.i IS NULL OR batch.i >= deletes.i " + + "ORDER BY batch.i ASC) AS src " + + "ON `" + DESTINATION_TABLE.getTable() + "`." + KEY + "=src.key AND src.value IS NULL " + + "WHEN MATCHED " + + "THEN DELETE " + + "WHEN NOT MATCHED AND src.value IS NOT NULL " + + "THEN INSERT (`" + + KEY + "`, " + + "`f1`, `f2`, `f3`, `f4`) " + + "VALUES (" + + "src.key, " + + "src.value.f1, src.value.f2, src.value.f3, src.value.f4" + + ");"; + String actualQuery = mergeQueries(false, false, true) + .mergeFlushQuery(INTERMEDIATE_TABLE, DESTINATION_TABLE, BATCH_NUMBER); + assertEquals(expectedQuery, actualQuery); + } + + @Test + public void testUpsertDeleteQueryWithPartitionTime() { + String expectedQuery = + "MERGE " + table(DESTINATION_TABLE) + " " + + "USING (SELECT * FROM (SELECT ARRAY_AGG(x ORDER BY i DESC LIMIT 1)[OFFSET(0)] src " + + "FROM " + table(INTERMEDIATE_TABLE) + " x " + + "WHERE batchNumber=" + BATCH_NUMBER + " " + + "GROUP BY key.k1, key.k2.nested_k1.doubly_nested_k, key.k2.nested_k2)) " + + "ON `" + DESTINATION_TABLE.getTable() + "`." + KEY + "=src.key " + + "WHEN MATCHED AND src.value IS NOT NULL " + + "THEN UPDATE SET `f1`=src.value.f1, `f2`=src.value.f2, `f3`=src.value.f3, `f4`=src.value.f4 " + + "WHEN MATCHED AND src.value IS NULL " + + "THEN DELETE " + + "WHEN NOT MATCHED AND src.value IS NOT NULL " + + "THEN INSERT (`" + + KEY + "`, " + + "_PARTITIONTIME, " + + "`f1`, `f2`, `f3`, `f4`) " + + "VALUES (" + + "src.key, " + + "CAST(CAST(DATE(src.partitionTime) AS DATE) AS TIMESTAMP), " + + "src.value.f1, src.value.f2, src.value.f3, src.value.f4" + + ");"; + String actualQuery = mergeQueries(true, true, true) + .mergeFlushQuery(INTERMEDIATE_TABLE, DESTINATION_TABLE, BATCH_NUMBER); + assertEquals(expectedQuery, actualQuery); + } + + @Test + public void testUpsertDeleteQueryWithoutPartitionTime() { + String expectedQuery = + "MERGE " + table(DESTINATION_TABLE) + " " + + "USING (SELECT * FROM (SELECT ARRAY_AGG(x ORDER BY i DESC LIMIT 1)[OFFSET(0)] src " + + "FROM " + table(INTERMEDIATE_TABLE) + " x " + + "WHERE batchNumber=" + BATCH_NUMBER + " " + + "GROUP BY key.k1, key.k2.nested_k1.doubly_nested_k, key.k2.nested_k2)) " + + "ON `" + DESTINATION_TABLE.getTable() + "`." 
+ KEY + "=src.key " + + "WHEN MATCHED AND src.value IS NOT NULL " + + "THEN UPDATE SET `f1`=src.value.f1, `f2`=src.value.f2, `f3`=src.value.f3, `f4`=src.value.f4 " + + "WHEN MATCHED AND src.value IS NULL " + + "THEN DELETE " + + "WHEN NOT MATCHED AND src.value IS NOT NULL " + + "THEN INSERT (`" + + KEY + "`, " + + "`f1`, `f2`, `f3`, `f4`) " + + "VALUES (" + + "src.key, " + + "src.value.f1, src.value.f2, src.value.f3, src.value.f4" + + ");"; String actualQuery = mergeQueries(false, true, true) + .mergeFlushQuery(INTERMEDIATE_TABLE, DESTINATION_TABLE, BATCH_NUMBER); + assertEquals(expectedQuery, actualQuery); + } + + @Test + public void testBatchClearQuery() { + String expectedQuery = + "DELETE FROM " + table(INTERMEDIATE_TABLE) + + " WHERE batchNumber <= " + BATCH_NUMBER + + " AND _PARTITIONTIME IS NOT NULL;"; + // No difference in batch clearing between upsert, delete, and both, or with or without partition time + String actualQuery = MergeQueries.batchClearQuery(INTERMEDIATE_TABLE, BATCH_NUMBER); + assertEquals(expectedQuery, actualQuery); + } + + private String table(TableId table) { + return String.format("`%s`.`%s`", table.getDataset(), table.getTable()); + } +} diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/SchemaManagerTest.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/SchemaManagerTest.java index fe5fa73cb..74a4b7462 100644 --- a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/SchemaManagerTest.java +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/SchemaManagerTest.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,29 +17,53 @@ * under the License. 
*/ +package com.wepay.kafka.connect.bigquery; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; import com.google.cloud.bigquery.BigQuery; import com.google.cloud.bigquery.Field; +import com.google.cloud.bigquery.Field.Mode; import com.google.cloud.bigquery.LegacySQLTypeName; import com.google.cloud.bigquery.StandardTableDefinition; +import com.google.cloud.bigquery.Table; +import com.google.cloud.bigquery.TableDefinition; import com.google.cloud.bigquery.TableId; import com.google.cloud.bigquery.TableInfo; +import com.google.cloud.bigquery.TimePartitioning; +import com.google.common.collect.ImmutableList; import com.wepay.kafka.connect.bigquery.api.SchemaRetriever; +import com.wepay.kafka.connect.bigquery.convert.BigQuerySchemaConverter; import com.wepay.kafka.connect.bigquery.convert.SchemaConverter; +import com.wepay.kafka.connect.bigquery.exception.BigQueryConnectException; +import com.wepay.kafka.connect.bigquery.retrieve.IdentitySchemaRetriever; +import java.util.Random; + +import com.wepay.kafka.connect.bigquery.utils.FieldNameSanitizer; import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.SchemaBuilder; +import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.sink.SinkRecord; import org.junit.Assert; import org.junit.Before; import org.junit.Test; +import org.mockito.ArgumentCaptor; +import org.mockito.stubbing.OngoingStubbing; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Optional; +import java.util.stream.Collectors; public class SchemaManagerTest { @@ -70,37 +94,183 @@ public void testBQTableDescription() { Optional kafkaKeyFieldName = Optional.of("kafkaKey"); Optional kafkaDataFieldName = Optional.of("kafkaData"); SchemaManager schemaManager = new SchemaManager(mockSchemaRetriever, mockSchemaConverter, - mockBigQuery, false, false, kafkaKeyFieldName, kafkaDataFieldName, Optional.empty(), Optional.empty()); + mockBigQuery, false, false, false, false, kafkaKeyFieldName, kafkaDataFieldName, + Optional.empty(), Optional.empty(), Optional.empty(), Optional.of(TimePartitioning.Type.DAY)); when(mockSchemaConverter.convertSchema(mockKafkaSchema)).thenReturn(fakeBigQuerySchema); when(mockKafkaSchema.doc()).thenReturn(testDoc); TableInfo tableInfo = schemaManager - .constructTableInfo(tableId, fakeBigQuerySchema, testDoc); + .constructTableInfo(tableId, fakeBigQuerySchema, testDoc, true); - Assert.assertEquals("Kafka doc does not match BigQuery table description", + assertEquals("Kafka doc does not match BigQuery table description", testDoc, tableInfo.getDescription()); Assert.assertNull("Timestamp partition field name is not null", ((StandardTableDefinition) tableInfo.getDefinition()).getTimePartitioning().getField()); + Assert.assertNull("Partition expiration is not null", + ((StandardTableDefinition) tableInfo.getDefinition()).getTimePartitioning().getExpirationMs()); } @Test public void testTimestampPartitionSet() { Optional testField = Optional.of("testField"); SchemaManager schemaManager = new SchemaManager(mockSchemaRetriever, mockSchemaConverter, - mockBigQuery, false, false, Optional.empty(), Optional.empty(), testField, Optional.empty()); + mockBigQuery, false, false, false, false, Optional.empty(), Optional.empty(), testField, + 
Optional.empty(), Optional.empty(), Optional.of(TimePartitioning.Type.DAY)); when(mockSchemaConverter.convertSchema(mockKafkaSchema)).thenReturn(fakeBigQuerySchema); when(mockKafkaSchema.doc()).thenReturn(testDoc); TableInfo tableInfo = schemaManager - .constructTableInfo(tableId, fakeBigQuerySchema, testDoc); + .constructTableInfo(tableId, fakeBigQuerySchema, testDoc, true); + + assertEquals("Kafka doc does not match BigQuery table description", + testDoc, tableInfo.getDescription()); + StandardTableDefinition definition = tableInfo.getDefinition(); + Assert.assertNotNull(definition.getTimePartitioning()); + Assert.assertEquals(TimePartitioning.Type.DAY, definition.getTimePartitioning().getType()); + Assert.assertEquals("The field name does not match the field name of time partition", + testField.get(), + definition.getTimePartitioning().getField()); + Assert.assertNull("Partition expiration is not null", + ((StandardTableDefinition) tableInfo.getDefinition()).getTimePartitioning().getExpirationMs()); + } + + @Test + public void testAlternativeTimestampPartitionType() { + SchemaManager schemaManager = new SchemaManager(mockSchemaRetriever, mockSchemaConverter, + mockBigQuery, false, false, false, false, Optional.empty(), Optional.empty(), Optional.empty(), + Optional.empty(), Optional.empty(), Optional.of(TimePartitioning.Type.HOUR)); + + when(mockSchemaConverter.convertSchema(mockKafkaSchema)).thenReturn(fakeBigQuerySchema); + when(mockKafkaSchema.doc()).thenReturn(testDoc); + + TableInfo tableInfo = schemaManager + .constructTableInfo(tableId, fakeBigQuerySchema, testDoc, true); + + Assert.assertEquals("Kafka doc does not match BigQuery table description", + testDoc, tableInfo.getDescription()); + StandardTableDefinition definition = tableInfo.getDefinition(); + Assert.assertNotNull(definition.getTimePartitioning()); + Assert.assertEquals(TimePartitioning.Type.HOUR, definition.getTimePartitioning().getType()); + } + + @Test + public void testNoTimestampPartitionType() { + SchemaManager schemaManager = new SchemaManager(mockSchemaRetriever, mockSchemaConverter, + mockBigQuery, false, false, false, false, Optional.empty(), Optional.empty(), Optional.empty(), + Optional.empty(), Optional.empty(), Optional.empty()); + + when(mockSchemaConverter.convertSchema(mockKafkaSchema)).thenReturn(fakeBigQuerySchema); + when(mockKafkaSchema.doc()).thenReturn(testDoc); + + TableInfo tableInfo = schemaManager + .constructTableInfo(tableId, fakeBigQuerySchema, testDoc, true); Assert.assertEquals("Kafka doc does not match BigQuery table description", testDoc, tableInfo.getDescription()); + StandardTableDefinition definition = tableInfo.getDefinition(); + Assert.assertNull(definition.getTimePartitioning()); + } + + @Test + public void testUpdateTimestampPartitionNull() { + Optional testField = Optional.of("testField"); + SchemaManager schemaManager = new SchemaManager(mockSchemaRetriever, mockSchemaConverter, + mockBigQuery, false, false, false, false, Optional.empty(), Optional.empty(), testField, + Optional.empty(), Optional.empty(), Optional.of(TimePartitioning.Type.DAY)); + + when(mockSchemaConverter.convertSchema(mockKafkaSchema)).thenReturn(fakeBigQuerySchema); + when(mockKafkaSchema.doc()).thenReturn(testDoc); + + TableInfo tableInfo = schemaManager + .constructTableInfo(tableId, fakeBigQuerySchema, testDoc, false); + + Assert.assertEquals("Kafka doc does not match BigQuery table description", + testDoc, tableInfo.getDescription()); + Assert.assertNull("The time partitioning object should be null", 
+ ((StandardTableDefinition) tableInfo.getDefinition()).getTimePartitioning()); + } + + @Test + public void testUpdateTimestampPartitionNotSet() { + Optional testField = Optional.of("testField"); + SchemaManager schemaManager = new SchemaManager(mockSchemaRetriever, mockSchemaConverter, + mockBigQuery, false, false, false, false, Optional.empty(), Optional.empty(), testField, + Optional.empty(), Optional.empty(), Optional.of(TimePartitioning.Type.DAY)); + + when(mockSchemaConverter.convertSchema(mockKafkaSchema)).thenReturn(fakeBigQuerySchema); + when(mockKafkaSchema.doc()).thenReturn(testDoc); + + TableInfo tableInfo = schemaManager + .constructTableInfo(tableId, fakeBigQuerySchema, testDoc, true); + + Assert.assertEquals("Kafka doc does not match BigQuery table description", + testDoc, tableInfo.getDescription()); + StandardTableDefinition definition = tableInfo.getDefinition(); + Assert.assertNotNull(definition.getTimePartitioning()); Assert.assertEquals("The field name does not match the field name of time partition", testField.get(), - ((StandardTableDefinition) tableInfo.getDefinition()).getTimePartitioning().getField()); + definition.getTimePartitioning().getField()); + + Optional updateField = Optional.of("testUpdateField"); + schemaManager = new SchemaManager(mockSchemaRetriever, mockSchemaConverter, + mockBigQuery, false, false, false, false, Optional.empty(), Optional.empty(), updateField, Optional.empty(), Optional.empty(), + Optional.of(TimePartitioning.Type.DAY)); + + tableInfo = schemaManager + .constructTableInfo(tableId, fakeBigQuerySchema, testDoc, false); + definition = tableInfo.getDefinition(); + Assert.assertNull("The time partitioning object should be null", + ((StandardTableDefinition) tableInfo.getDefinition()).getTimePartitioning()); + } + + @Test + public void testPartitionExpirationSetWithoutFieldName() { + Optional testExpirationMs = Optional.of(86400000L); + SchemaManager schemaManager = new SchemaManager(mockSchemaRetriever, mockSchemaConverter, + mockBigQuery, false, false, false, false, Optional.empty(), Optional.empty(), Optional.empty(), + testExpirationMs, Optional.empty(), Optional.of(TimePartitioning.Type.DAY)); + + when(mockSchemaConverter.convertSchema(mockKafkaSchema)).thenReturn(fakeBigQuerySchema); + when(mockKafkaSchema.doc()).thenReturn(testDoc); + + TableInfo tableInfo = schemaManager + .constructTableInfo(tableId, fakeBigQuerySchema, testDoc, true); + + Assert.assertEquals("Kafka doc does not match BigQuery table description", + testDoc, tableInfo.getDescription()); + StandardTableDefinition tableDefinition = (StandardTableDefinition) tableInfo.getDefinition(); + Assert.assertEquals("The partition expiration does not match the expiration in ms", + testExpirationMs.get(), + tableDefinition.getTimePartitioning().getExpirationMs()); + Assert.assertNull("Timestamp partition field name is not null", + tableDefinition.getTimePartitioning().getField()); + } + + @Test + public void testPartitionExpirationSetWithFieldName() { + Optional testExpirationMs = Optional.of(86400000L); + Optional testField = Optional.of("testField"); + SchemaManager schemaManager = new SchemaManager(mockSchemaRetriever, mockSchemaConverter, + mockBigQuery, false, false, false, false, Optional.empty(), Optional.empty(), testField, + testExpirationMs, Optional.empty(), Optional.of(TimePartitioning.Type.DAY)); + + when(mockSchemaConverter.convertSchema(mockKafkaSchema)).thenReturn(fakeBigQuerySchema); + when(mockKafkaSchema.doc()).thenReturn(testDoc); + + TableInfo tableInfo = 
schemaManager + .constructTableInfo(tableId, fakeBigQuerySchema, testDoc, true); + + Assert.assertEquals("Kafka doc does not match BigQuery table description", + testDoc, tableInfo.getDescription()); + StandardTableDefinition tableDefinition = (StandardTableDefinition) tableInfo.getDefinition(); + Assert.assertEquals("The partition expiration does not match the expiration in ms", + testExpirationMs.get(), + tableDefinition.getTimePartitioning().getExpirationMs()); + Assert.assertEquals("The field name does not match the field name of time partition", + testField.get(), + tableDefinition.getTimePartitioning().getField()); } @Test @@ -108,20 +278,688 @@ public void testClusteringPartitionSet() { Optional timestampPartitionFieldName = Optional.of("testField"); Optional> testField = Optional.of(Arrays.asList("column1", "column2")); SchemaManager schemaManager = new SchemaManager(mockSchemaRetriever, mockSchemaConverter, - mockBigQuery, false, false, Optional.empty(), Optional.empty(), timestampPartitionFieldName, testField); + mockBigQuery, false, false, false, false, Optional.empty(), Optional.empty(), timestampPartitionFieldName, + Optional.empty(), testField, Optional.of(TimePartitioning.Type.DAY)); + + when(mockSchemaConverter.convertSchema(mockKafkaSchema)).thenReturn(fakeBigQuerySchema); + when(mockKafkaSchema.doc()).thenReturn(testDoc); + + TableInfo tableInfo = schemaManager + .constructTableInfo(tableId, fakeBigQuerySchema, testDoc, true); + + assertEquals("Kafka doc does not match BigQuery table description", + testDoc, tableInfo.getDescription()); + StandardTableDefinition definition = tableInfo.getDefinition(); + Assert.assertNotNull(definition.getClustering()); + assertEquals("The field name does not match the field name of time partition", + testField.get(), + definition.getClustering().getFields()); + } + + @Test + public void testUpdateClusteringPartitionNull() { + Optional timestampPartitionFieldName = Optional.of("testField"); + Optional> testField = Optional.of(Arrays.asList("column1", "column2")); + SchemaManager schemaManager = new SchemaManager(mockSchemaRetriever, mockSchemaConverter, + mockBigQuery, false, false, false, false, Optional.empty(), Optional.empty(), timestampPartitionFieldName, + Optional.empty(), testField, Optional.of(TimePartitioning.Type.DAY)); when(mockSchemaConverter.convertSchema(mockKafkaSchema)).thenReturn(fakeBigQuerySchema); when(mockKafkaSchema.doc()).thenReturn(testDoc); TableInfo tableInfo = schemaManager - .constructTableInfo(tableId, fakeBigQuerySchema, testDoc); + .constructTableInfo(tableId, fakeBigQuerySchema, testDoc, false); + + Assert.assertEquals("Kafka doc does not match BigQuery table description", + testDoc, tableInfo.getDescription()); + StandardTableDefinition definition = tableInfo.getDefinition(); + Assert.assertNull("The clustering object should be null", definition.getClustering()); + } + + @Test + public void testUpdateClusteringPartitionNotSet() { + Optional timestampPartitionFieldName = Optional.of("testField"); + Optional> testField = Optional.of(Arrays.asList("column1", "column2")); + SchemaManager schemaManager = new SchemaManager(mockSchemaRetriever, mockSchemaConverter, + mockBigQuery, false, false, false, false, Optional.empty(), Optional.empty(), timestampPartitionFieldName, + Optional.empty(), testField, Optional.of(TimePartitioning.Type.DAY)); + + when(mockSchemaConverter.convertSchema(mockKafkaSchema)).thenReturn(fakeBigQuerySchema); + when(mockKafkaSchema.doc()).thenReturn(testDoc); + + TableInfo tableInfo = 
schemaManager + .constructTableInfo(tableId, fakeBigQuerySchema, testDoc, true); Assert.assertEquals("Kafka doc does not match BigQuery table description", testDoc, tableInfo.getDescription()); StandardTableDefinition definition = tableInfo.getDefinition(); Assert.assertNotNull(definition.getClustering()); - Assert.assertEquals("The field name does not match the field name of time partition", + Assert.assertEquals("The field name should not match the field name of time partition", testField.get(), definition.getClustering().getFields()); + + Optional> updateTestField = Optional.of(Arrays.asList("column3", "column4")); + schemaManager = new SchemaManager(mockSchemaRetriever, mockSchemaConverter, + mockBigQuery, false, false, false, false, Optional.empty(), Optional.empty(), timestampPartitionFieldName, + Optional.empty(), updateTestField, Optional.of(TimePartitioning.Type.DAY)); + + tableInfo = schemaManager + .constructTableInfo(tableId, fakeBigQuerySchema, testDoc, false); + definition = tableInfo.getDefinition(); + Assert.assertNull("The clustering object should be null", definition.getClustering()); + } + + @Test + public void testSuccessfulUpdateWithOnlyRelaxedFields() { + com.google.cloud.bigquery.Schema existingSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build() + ); + + com.google.cloud.bigquery.Schema relaxedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.NULLABLE).build() + ); + + SchemaManager schemaManager = createSchemaManager(false, true, false); + + testGetAndValidateProposedSchema(schemaManager, existingSchema, relaxedSchema, relaxedSchema); + } + + @Test(expected = BigQueryConnectException.class) + public void testDisallowedUpdateWithOnlyRelaxedFields() { + com.google.cloud.bigquery.Schema existingSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build() + ); + + com.google.cloud.bigquery.Schema relaxedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.NULLABLE).build() + ); + + SchemaManager schemaManager = createSchemaManager(true, false, false); + + testGetAndValidateProposedSchema(schemaManager, existingSchema, relaxedSchema, null); + } + + @Test + public void testSuccessfulUpdateWithOnlyNewFields() { + com.google.cloud.bigquery.Schema existingSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build() + ); + + com.google.cloud.bigquery.Schema expandedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build(), + Field.newBuilder("f2", LegacySQLTypeName.INTEGER).setMode(Field.Mode.NULLABLE).build() + ); + + SchemaManager schemaManager = createSchemaManager(true, false, false); + + testGetAndValidateProposedSchema(schemaManager, existingSchema, expandedSchema, expandedSchema); + } + + @Test(expected = BigQueryConnectException.class) + public void testDisallowedUpdateWithOnlyNewFields() { + com.google.cloud.bigquery.Schema existingSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build() + ); + + com.google.cloud.bigquery.Schema expandedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", 
LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build(), + Field.newBuilder("f2", LegacySQLTypeName.INTEGER).setMode(Field.Mode.NULLABLE).build() + ); + + SchemaManager schemaManager = createSchemaManager(false, true, false); + + testGetAndValidateProposedSchema(schemaManager, existingSchema, expandedSchema, null); + } + + @Test(expected = BigQueryConnectException.class) + public void testDisallowedUpdateWithOnlyNewRequiredFields() { + com.google.cloud.bigquery.Schema existingSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build() + ); + + com.google.cloud.bigquery.Schema expandedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build(), + Field.newBuilder("f2", LegacySQLTypeName.INTEGER).setMode(Field.Mode.REQUIRED).build() + ); + + SchemaManager schemaManager = createSchemaManager(true, false, false); + + testGetAndValidateProposedSchema(schemaManager, existingSchema, expandedSchema, null); + } + + @Test + public void testSuccessfulUpdateWithNewAndRelaxedFields() { + com.google.cloud.bigquery.Schema existingSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build() + ); + + com.google.cloud.bigquery.Schema expandedAndRelaxedSchema = com.google.cloud.bigquery.Schema.of( + // Relax an existing field from required to nullable + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.NULLABLE).build(), + // Add a new nullable field + Field.newBuilder("f2", LegacySQLTypeName.INTEGER).setMode(Field.Mode.NULLABLE).build(), + // Add a new required field (that should be relaxed to nullable automatically) + Field.newBuilder("f3", LegacySQLTypeName.NUMERIC).setMode(Field.Mode.REQUIRED).build() + ); + + com.google.cloud.bigquery.Schema expectedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("f2", LegacySQLTypeName.INTEGER).setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("f3", LegacySQLTypeName.NUMERIC).setMode(Field.Mode.NULLABLE).build() + ); + + SchemaManager schemaManager = createSchemaManager(true, true, false); + + testGetAndValidateProposedSchema + (schemaManager, existingSchema, expandedAndRelaxedSchema, expectedSchema); + } + + @Test + public void testSuccessfulUnionizedUpdateWithNewAndRelaxedFields() { + com.google.cloud.bigquery.Schema existingSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build() + ); + + com.google.cloud.bigquery.Schema disjointSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f2", LegacySQLTypeName.INTEGER).setMode(Field.Mode.REQUIRED).build() + ); + + com.google.cloud.bigquery.Schema expectedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("f2", LegacySQLTypeName.INTEGER).setMode(Field.Mode.NULLABLE).build() + ); + + SchemaManager schemaManager = createSchemaManager(true, true, true); + + testGetAndValidateProposedSchema(schemaManager, existingSchema, disjointSchema, expectedSchema); + } + + @Test + public void testSuccessfulUnionizedUpdateWithNewRepeatedField() { + com.google.cloud.bigquery.Schema reducedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", 
LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build() + ); + + com.google.cloud.bigquery.Schema expandedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build(), + Field.newBuilder("f2", LegacySQLTypeName.INTEGER).setMode(Field.Mode.REPEATED).build() + ); + + com.google.cloud.bigquery.Schema expectedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build(), + Field.newBuilder("f2", LegacySQLTypeName.INTEGER).setMode(Field.Mode.REPEATED).build() + ); + + SchemaManager schemaManager = createSchemaManager(true, true, true); + + // Unionization should work symmetrically, so test both cases of reduced/expanded as the current/new schemas + testGetAndValidateProposedSchema(schemaManager, reducedSchema, expandedSchema, expectedSchema); + testGetAndValidateProposedSchema(schemaManager, expandedSchema, reducedSchema, expectedSchema); + } + + @Test + public void testSuccessfulUpdateWithNewRepeatedField() { + com.google.cloud.bigquery.Schema existingSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build() + ); + + com.google.cloud.bigquery.Schema expandedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build(), + Field.newBuilder("f2", LegacySQLTypeName.INTEGER).setMode(Field.Mode.REPEATED).build() + ); + + com.google.cloud.bigquery.Schema expectedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build(), + Field.newBuilder("f2", LegacySQLTypeName.INTEGER).setMode(Field.Mode.REPEATED).build() + ); + + SchemaManager schemaManager = createSchemaManager(true, true, false); + + testGetAndValidateProposedSchema(schemaManager, existingSchema, expandedSchema, expectedSchema); } + + @Test(expected = BigQueryConnectException.class) + public void testDisallowedUnionizedUpdateWithNewField() { + com.google.cloud.bigquery.Schema existingSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build() + ); + + com.google.cloud.bigquery.Schema expandedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build(), + Field.newBuilder("f2", LegacySQLTypeName.INTEGER).setMode(Field.Mode.NULLABLE).build() + ); + + SchemaManager schemaManager = createSchemaManager(false, true, true); + + testGetAndValidateProposedSchema(schemaManager, existingSchema, expandedSchema, null); + } + + @Test(expected = BigQueryConnectException.class) + public void testDisallowedUnionizedUpdateWithRelaxedField() { + com.google.cloud.bigquery.Schema existingSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build() + ); + + com.google.cloud.bigquery.Schema expandedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.NULLABLE).build() + ); + + SchemaManager schemaManager = createSchemaManager(true, false, true); + + testGetAndValidateProposedSchema(schemaManager, existingSchema, expandedSchema, null); + } + + @Test + public void testUnionizedUpdateWithMultipleSchemas() { + com.google.cloud.bigquery.Schema existingSchema = com.google.cloud.bigquery.Schema.of( + 
Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build() + ); + + com.google.cloud.bigquery.Schema firstNewSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.NULLABLE).build() + ); + com.google.cloud.bigquery.Schema secondNewSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build(), + Field.newBuilder("f2", LegacySQLTypeName.INTEGER).setMode(Field.Mode.REQUIRED).build() + ); + com.google.cloud.bigquery.Schema thirdNewSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build() + ); + List newSchemas = + Arrays.asList(firstNewSchema, secondNewSchema, thirdNewSchema); + + com.google.cloud.bigquery.Schema expectedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("f2", LegacySQLTypeName.INTEGER).setMode(Field.Mode.NULLABLE).build() + ); + + SchemaManager schemaManager = createSchemaManager(true, true, true); + + testGetAndValidateProposedSchema(schemaManager, existingSchema, newSchemas, expectedSchema); + } + + @Test + public void FieldsWithUnspecifiedModeShouldNotCauseNpe() { + com.google.cloud.bigquery.Schema existingSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).build() + ); + + com.google.cloud.bigquery.Schema expandedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).build(), + Field.newBuilder("f2", LegacySQLTypeName.INTEGER).build() + ); + + com.google.cloud.bigquery.Schema expectedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Mode.NULLABLE).build(), + Field.newBuilder("f2", LegacySQLTypeName.INTEGER).setMode(Mode.NULLABLE).build() + ); + + SchemaManager schemaManager = createSchemaManager(true, true, true); + + testGetAndValidateProposedSchema(schemaManager, existingSchema, expandedSchema, expectedSchema); + } + + @Test + public void testFieldNamesSanitizedNoExistingSchema() { + BigQuerySchemaConverter converter = new BigQuerySchemaConverter(false, true); + + Schema kafkaSchema = SchemaBuilder.struct() + .field("f 1", Schema.BOOLEAN_SCHEMA) + .field("f 2", Schema.INT32_SCHEMA) + .build(); + com.google.cloud.bigquery.Schema expectedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f_1", LegacySQLTypeName.BOOLEAN).setMode(Mode.REQUIRED).build(), + Field.newBuilder("f_2", LegacySQLTypeName.INTEGER).setMode(Mode.REQUIRED).build() + ); + + SchemaManager schemaManager = createSchemaManager(false, false, false, true, converter); + testGetAndValidateProposedSchema(schemaManager, null, + null, expectedSchema, + Collections.singletonList(recordWithValueSchema(kafkaSchema))); + } + + @Test + public void testFieldNameSanitizedNewFields() { + BigQuerySchemaConverter converter = new BigQuerySchemaConverter(false, true); + com.google.cloud.bigquery.Schema existingSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Mode.REQUIRED).build() + ); + Schema kafkaSchema = SchemaBuilder.struct() + .field("f1", Schema.BOOLEAN_SCHEMA) + .field("f 1", Schema.BOOLEAN_SCHEMA) + .field("f 2", Schema.INT32_SCHEMA) + .build(); + com.google.cloud.bigquery.Schema expectedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", 
LegacySQLTypeName.BOOLEAN).setMode(Mode.REQUIRED).build(), + Field.newBuilder("f_1", LegacySQLTypeName.BOOLEAN).setMode(Mode.NULLABLE).build(), + Field.newBuilder("f_2", LegacySQLTypeName.INTEGER).setMode(Mode.NULLABLE).build() + ); + + SchemaManager schemaManager = createSchemaManager(true, true, false, true, converter); + testGetAndValidateProposedSchema(schemaManager, existingSchema, + null, expectedSchema, + Collections.singletonList(recordWithValueSchema(kafkaSchema))); + } + + @Test + public void testFieldNamesSanitizedUnionizedFields() { + BigQuerySchemaConverter converter = new BigQuerySchemaConverter(false, true); + com.google.cloud.bigquery.Schema existingSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Mode.REQUIRED).build() + ); + Schema kafkaSchema = SchemaBuilder.struct() + .field("f 1", Schema.BOOLEAN_SCHEMA) + .field("f 2", Schema.INT32_SCHEMA) + .build(); + com.google.cloud.bigquery.Schema expectedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Mode.NULLABLE).build(), + Field.newBuilder("f_1", LegacySQLTypeName.BOOLEAN).setMode(Mode.NULLABLE).build(), + Field.newBuilder("f_2", LegacySQLTypeName.INTEGER).setMode(Mode.NULLABLE).build() + ); + + SchemaManager schemaManager = createSchemaManager(true, true, true, true, converter); + testGetAndValidateProposedSchema(schemaManager, existingSchema, + null, expectedSchema, + Collections.singletonList(recordWithValueSchema(kafkaSchema))); + } + + @Test + public void testFieldNamesSanitizedFieldRelaxation() { + BigQuerySchemaConverter converter = new BigQuerySchemaConverter(false, true); + com.google.cloud.bigquery.Schema existingSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f_1", LegacySQLTypeName.BOOLEAN).setMode(Mode.REQUIRED).build() + ); + Schema kafkaSchema = SchemaBuilder.struct() + .field("f 1", Schema.OPTIONAL_BOOLEAN_SCHEMA) + .build(); + com.google.cloud.bigquery.Schema expectedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f_1", LegacySQLTypeName.BOOLEAN).setMode(Mode.NULLABLE).build() + ); + + SchemaManager schemaManager = createSchemaManager(true, true, false, true, converter); + testGetAndValidateProposedSchema(schemaManager, existingSchema, + null, expectedSchema, + Collections.singletonList(recordWithValueSchema(kafkaSchema))); + } + + @Test + public void testUpdateWithOnlyTombstoneRecordsAndExistingSchema() { + com.google.cloud.bigquery.Schema existingSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build() + ); + + SchemaManager schemaManager = createSchemaManager(true, false, false); + List incomingSinkRecords = Collections.nCopies(2, recordWithValueSchema(null)); + // Tombstone records are skipped, and existing schema is reused. 
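+    // (With only null-valued records in the batch and a table that already has a schema, the
+    // proposed schema is expected to be identical to the existing one, which is why existingSchema
+    // is also passed as the expected result below.)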
+ testGetAndValidateProposedSchema(schemaManager, existingSchema, + Collections.singletonList(existingSchema), existingSchema, incomingSinkRecords); + } + + @Test(expected = BigQueryConnectException.class) + public void testUpdateWithOnlyTombstoneRecordsNoExistingSchema() { + SchemaManager schemaManager = createSchemaManager(true, false, false); + List incomingSinkRecords = Collections.nCopies(2, recordWithValueSchema(null)); + testGetAndValidateProposedSchema( + schemaManager, null, Collections.singletonList(null), null, incomingSinkRecords); + } + + @Test + public void testUpdateWithRegularAndTombstoneRecords() { + com.google.cloud.bigquery.Schema existingSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build() + ); + + com.google.cloud.bigquery.Schema expandedSchema = com.google.cloud.bigquery.Schema.of( + Field.newBuilder("f1", LegacySQLTypeName.BOOLEAN).setMode(Field.Mode.REQUIRED).build(), + Field.newBuilder("f2", LegacySQLTypeName.INTEGER).setMode(Field.Mode.NULLABLE).build() + ); + + SchemaManager schemaManager = createSchemaManager(true, false, false); + // Put tombstone at the end of the batch. + List incomingSinkRecords = ImmutableList.of( + recordWithValueSchema(mockKafkaSchema), recordWithValueSchema(null)); + // Tombstone record is skipped when converting schema. + testGetAndValidateProposedSchema(schemaManager, existingSchema, + Collections.singletonList(expandedSchema), expandedSchema, incomingSinkRecords); + } + + @Test + public void testGetUnionizedTableDescriptionFromTombstoneRecord() { + SchemaManager schemaManager = createSchemaManager(false, true, true); + SinkRecord tombstone = recordWithValueSchema(null); + List incomingSinkRecords = ImmutableList.of(tombstone); + Assert.assertNull(schemaManager.getUnionizedTableDescription(incomingSinkRecords)); + } + + @Test + public void testGetUnionizedTableDescriptionFromRegularAndNullRecords() { + SchemaManager schemaManager = createSchemaManager(false, true, true).forIntermediateTables(); + List incomingSinkRecords = ImmutableList.of( + recordWithValueSchema(mockKafkaSchema), recordWithValueSchema(null)); + when(mockKafkaSchema.doc()).thenReturn(testDoc); + Assert.assertNotNull(schemaManager.getUnionizedTableDescription(incomingSinkRecords)); + } + + private SchemaManager createSchemaManager( + boolean allowNewFields, boolean allowFieldRelaxation, boolean allowUnionization, boolean sanitizeFieldNames, SchemaConverter converter) { + return new SchemaManager(new IdentitySchemaRetriever(), converter, mockBigQuery, + allowNewFields, allowFieldRelaxation, allowUnionization, sanitizeFieldNames, + Optional.empty(), Optional.empty(), Optional.empty(), Optional.empty(), Optional.empty(), + Optional.of(TimePartitioning.Type.DAY)); + } + + private SchemaManager createSchemaManager( + boolean allowNewFields, boolean allowFieldRelaxation, boolean allowUnionization) { + return new SchemaManager(new IdentitySchemaRetriever(), mockSchemaConverter, mockBigQuery, + allowNewFields, allowFieldRelaxation, allowUnionization, false, + Optional.empty(), Optional.empty(), Optional.empty(), Optional.empty(), Optional.empty(), + Optional.of(TimePartitioning.Type.DAY)); + } + + private void testGetAndValidateProposedSchema( + SchemaManager schemaManager, + com.google.cloud.bigquery.Schema existingSchema, + com.google.cloud.bigquery.Schema newSchema, + com.google.cloud.bigquery.Schema expectedSchema) { + testGetAndValidateProposedSchema( + schemaManager, existingSchema, 
Collections.singletonList(newSchema), expectedSchema); + } + + private void testGetAndValidateProposedSchema( + SchemaManager schemaManager, + com.google.cloud.bigquery.Schema existingSchema, + List newSchemas, + com.google.cloud.bigquery.Schema expectedSchema) { + testGetAndValidateProposedSchema(schemaManager, existingSchema, newSchemas, expectedSchema, + Collections.nCopies(newSchemas.size(), recordWithValueSchema(mockKafkaSchema))); + } + + private void testGetAndValidateProposedSchema( + SchemaManager schemaManager, + com.google.cloud.bigquery.Schema existingSchema, + List newSchemas, + com.google.cloud.bigquery.Schema expectedSchema, + List incomingSinkRecords) { + Table existingTable = existingSchema != null ? tableWithSchema(existingSchema) : null; + when(mockBigQuery.getTable(tableId)).thenReturn(existingTable); + + if (newSchemas != null) { + OngoingStubbing converterStub = + when(mockSchemaConverter.convertSchema(mockKafkaSchema)); + for (com.google.cloud.bigquery.Schema newSchema : newSchemas) { + // The converter will return the schemas in the order that they are provided to it with the + // call to "thenReturn" + converterStub = converterStub.thenReturn(newSchema); + } + } + + com.google.cloud.bigquery.Schema proposedSchema = + schemaManager.getAndValidateProposedSchema(tableId, incomingSinkRecords); + + if (expectedSchema != null) { + assertEquals(expectedSchema, proposedSchema); + } + } + + private Table tableWithSchema(com.google.cloud.bigquery.Schema schema) { + TableDefinition definition = mock(TableDefinition.class); + when(definition.getSchema()).thenReturn(schema); + + Table result = mock(Table.class); + when(result.getDefinition()).thenReturn(definition); + + return result; + } + + private SinkRecord recordWithValueSchema(Schema valueSchema) { + SinkRecord result = mock(SinkRecord.class); + when(result.valueSchema()).thenReturn(valueSchema); + return result; + } + + @Test + public void testUnionizeSchemaNoNestedOrRepeatedRecords() { + com.google.cloud.bigquery.Schema s1 = com.google.cloud.bigquery.Schema.of( + Field.of(LegacySQLTypeName.BYTES.name(), LegacySQLTypeName.BYTES), + Field.of(LegacySQLTypeName.STRING.name(), LegacySQLTypeName.STRING), + Field.of(LegacySQLTypeName.DATE.name(), LegacySQLTypeName.DATE) + ); + com.google.cloud.bigquery.Schema s2 = com.google.cloud.bigquery.Schema.of( + Field.of(LegacySQLTypeName.TIMESTAMP.name(), LegacySQLTypeName.TIMESTAMP), + Field.of(LegacySQLTypeName.FLOAT.name(), LegacySQLTypeName.FLOAT) + ); + + List expectedFields = new ArrayList<>(); + expectedFields.addAll(s1.getFields()); + expectedFields.addAll(s2.getFields()); + + assertUnion(makeNullable(com.google.cloud.bigquery.Schema.of(expectedFields)), s1, s2); + } + + @Test + public void testUnionizeSchemaWithNestedRecords() { + com.google.cloud.bigquery.Schema s1 = com.google.cloud.bigquery.Schema.of( + Field.of(LegacySQLTypeName.RECORD.name(), LegacySQLTypeName.RECORD, + Field.of(LegacySQLTypeName.STRING.name(), LegacySQLTypeName.STRING), + Field.of(LegacySQLTypeName.DATE.name(), LegacySQLTypeName.DATE) + )); + com.google.cloud.bigquery.Schema s2 = com.google.cloud.bigquery.Schema.of( + Field.of(LegacySQLTypeName.RECORD.name(), LegacySQLTypeName.RECORD, + Field.of(LegacySQLTypeName.TIMESTAMP.name(), LegacySQLTypeName.TIMESTAMP) + )); + com.google.cloud.bigquery.Schema expected = com.google.cloud.bigquery.Schema.of( + Field.of(LegacySQLTypeName.RECORD.name(), LegacySQLTypeName.RECORD, + Field.of(LegacySQLTypeName.STRING.name(), LegacySQLTypeName.STRING), + 
Field.of(LegacySQLTypeName.DATE.name(), LegacySQLTypeName.DATE), + Field.of(LegacySQLTypeName.TIMESTAMP.name(), LegacySQLTypeName.TIMESTAMP) + ) + ); + assertUnion(makeNullable(expected), s1, s2); + } + + @Test + public void testUnionizeSchemaWithNestedAndRepeatedFields() { + com.google.cloud.bigquery.Schema s1 = com.google.cloud.bigquery.Schema.of( + Field.of(LegacySQLTypeName.RECORD.name(), LegacySQLTypeName.RECORD, + Field.newBuilder(LegacySQLTypeName.STRING.name(), LegacySQLTypeName.STRING).setMode(Mode.REPEATED).build(), + Field.of(LegacySQLTypeName.RECORD.name(), LegacySQLTypeName.RECORD, + Field.of(LegacySQLTypeName.BYTES.name(), LegacySQLTypeName.BYTES) + ) + ) + ); + com.google.cloud.bigquery.Schema s2 = com.google.cloud.bigquery.Schema.of( + Field.of(LegacySQLTypeName.RECORD.name(), LegacySQLTypeName.RECORD, + Field.newBuilder(LegacySQLTypeName.DATE.name(), LegacySQLTypeName.DATE).setMode(Mode.REPEATED).build(), + Field.of(LegacySQLTypeName.RECORD.name(), LegacySQLTypeName.RECORD, + Field.of(LegacySQLTypeName.FLOAT.name(), LegacySQLTypeName.FLOAT) + ) + ) + ); + + com.google.cloud.bigquery.Schema expected = com.google.cloud.bigquery.Schema.of( + Field.of(LegacySQLTypeName.RECORD.name(), LegacySQLTypeName.RECORD, + Field.newBuilder(LegacySQLTypeName.STRING.name(), LegacySQLTypeName.STRING).setMode(Mode.REPEATED).build(), + Field.of(LegacySQLTypeName.RECORD.name(), LegacySQLTypeName.RECORD, + Field.of(LegacySQLTypeName.BYTES.name(), LegacySQLTypeName.BYTES), + Field.of(LegacySQLTypeName.FLOAT.name(), LegacySQLTypeName.FLOAT) + ), + Field.newBuilder(LegacySQLTypeName.DATE.name(), LegacySQLTypeName.DATE).setMode(Mode.REPEATED).build() + ) + ); + assertUnion(makeNullable(expected), s1, s2); + } + + @Test + public void testUnionizeSchemaNestedRelax() { + com.google.cloud.bigquery.Schema s1 = com.google.cloud.bigquery.Schema.of( + Field.of(LegacySQLTypeName.RECORD.name(), LegacySQLTypeName.RECORD, + Field.newBuilder(LegacySQLTypeName.STRING.name(), LegacySQLTypeName.STRING).setMode(Mode.REQUIRED).build() + ) + ); + com.google.cloud.bigquery.Schema s2 = com.google.cloud.bigquery.Schema.of( + Field.of(LegacySQLTypeName.RECORD.name(), LegacySQLTypeName.RECORD, + Field.newBuilder(LegacySQLTypeName.STRING.name(), LegacySQLTypeName.STRING).setMode(Mode.NULLABLE).build() + ) + ); + assertUnion(makeNullable(s2), s1, s2); + } + + @Test + public void testFieldNameSanitizedOnCreateTable() { + Schema embeddedStructWithInvalidFieldName = SchemaBuilder.struct() + .field("embedded-invalid", Schema.INT32_SCHEMA) + .build(); + Schema schemaWithInvalidFieldNames = SchemaBuilder.struct() + .field("1st field", Schema.BOOLEAN_SCHEMA) + .field("second-field", Schema.STRING_SCHEMA) + .field("embedded", embeddedStructWithInvalidFieldName) + .build(); + + List incomingSinkRecords = Collections.nCopies(2, recordWithValueSchema(schemaWithInvalidFieldNames)); + BigQuerySchemaConverter converter = new BigQuerySchemaConverter(true, true); + + SchemaManager schemaManager = createSchemaManager(false, false, false, true, converter); + schemaManager.createTable(tableId, incomingSinkRecords); + + ArgumentCaptor tableInfoCaptor = ArgumentCaptor.forClass(TableInfo.class); + + verify(mockBigQuery).create(tableInfoCaptor.capture()); + + com.google.cloud.bigquery.Schema actualSchema = tableInfoCaptor.getValue().getDefinition().getSchema(); + for (org.apache.kafka.connect.data.Field field : schemaWithInvalidFieldNames.fields()) { + String sanitizedName = FieldNameSanitizer.sanitizeName(field.name()); + 
assertEquals(sanitizedName, actualSchema.getFields().get(sanitizedName).getName()); + } + assertEquals("embedded_invalid", actualSchema.getFields().get("embedded").getSubFields().get(0).getName()); + } + + private com.google.cloud.bigquery.Schema makeNullable(com.google.cloud.bigquery.Schema s) { + return com.google.cloud.bigquery.Schema.of( + s.getFields().stream() + .map(this::makeNullable) + .collect(Collectors.toList()) + ); + } + + private Field makeNullable(Field f) { + Field.Builder builder = f.toBuilder(); + if (f.getSubFields() != null) { + List subFields = f.getSubFields().stream() + .map(this::makeNullable) + .collect(Collectors.toList()); + builder.setType(LegacySQLTypeName.RECORD, subFields.toArray(new Field[]{})).build(); + } + return builder + .setMode(f.getMode() == Mode.REPEATED ? Mode.REPEATED : Mode.NULLABLE) + .build(); + } + + private void assertUnion(com.google.cloud.bigquery.Schema expected, + com.google.cloud.bigquery.Schema schema1, com.google.cloud.bigquery.Schema schema2) { + SchemaManager sm = createSchemaManager(true, true, true); + assertEquals( + expected, sm.unionizeSchemas(schema1, schema2) + ); + } + } diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/SinkConnectorPropertiesFactory.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/SinkConnectorPropertiesFactory.java deleted file mode 100644 index e47dd8ac2..000000000 --- a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/SinkConnectorPropertiesFactory.java +++ /dev/null @@ -1,45 +0,0 @@ -package com.wepay.kafka.connect.bigquery; - -/* - * Copyright 2016 WePay, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - - -import com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig; - -import java.util.Map; - -public class SinkConnectorPropertiesFactory extends SinkPropertiesFactory { - @Override - public Map getProperties() { - Map properties = super.getProperties(); - - properties.put(BigQuerySinkConfig.TABLE_CREATE_CONFIG, "false"); - return properties; - } - - /** - * Make sure that each of the default configuration properties work nicely with the given - * configuration object. - * - * @param config The config object to test - */ - public void testProperties(BigQuerySinkConfig config) { - super.testProperties(config); - - config.getBoolean(config.TABLE_CREATE_CONFIG); - } -} diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/SinkPropertiesFactory.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/SinkPropertiesFactory.java index 9e20cc73a..fde3567c7 100644 --- a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/SinkPropertiesFactory.java +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/SinkPropertiesFactory.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery; import com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig; @@ -48,25 +49,4 @@ public Map getProperties() { return properties; } - - /** - * Make sure that each of the default configuration properties work nicely with the given - * configuration object. - * - * @param config The config object to test - */ - public void testProperties(BigQuerySinkConfig config) { - - config.getList(config.TOPICS_CONFIG); - config.getString(config.DEFAULT_DATASET_CONFIG); - - config.getKeyFile(); - config.getString(config.PROJECT_CONFIG); - - config.getBoolean(config.SANITIZE_TOPICS_CONFIG); - config.getInt(config.AVRO_DATA_CACHE_SIZE_CONFIG); - - config.getBoolean(config.ALLOW_NEW_BIGQUERY_FIELDS_CONFIG); - config.getBoolean(config.ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_CONFIG); - } } diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/SinkTaskPropertiesFactory.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/SinkTaskPropertiesFactory.java index 8e7b139af..286544338 100644 --- a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/SinkTaskPropertiesFactory.java +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/SinkTaskPropertiesFactory.java @@ -1,45 +1,17 @@ package com.wepay.kafka.connect.bigquery; -/* - * Copyright 2016 WePay, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - - -import com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig; import com.wepay.kafka.connect.bigquery.config.BigQuerySinkTaskConfig; import java.util.Map; public class SinkTaskPropertiesFactory extends SinkPropertiesFactory { + @Override public Map getProperties() { Map properties = super.getProperties(); - properties.put(BigQuerySinkConfig.TABLE_CREATE_CONFIG, "false"); + properties.put(BigQuerySinkTaskConfig.TASK_ID_CONFIG, "1"); return properties; } - - /** - * Make sure that each of the default configuration properties work nicely with the given - * configuration object. - * - * @param config The config object to test - */ - public void testProperties(BigQuerySinkTaskConfig config) { - super.testProperties(config); - } } diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/BigQuerySinkConfigTest.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/BigQuerySinkConfigTest.java index 6e916fc0a..cb61dd640 100644 --- a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/BigQuerySinkConfigTest.java +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/BigQuerySinkConfigTest.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.config; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,22 +17,23 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.config; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; - +import com.google.cloud.bigquery.TimePartitioning; import com.wepay.kafka.connect.bigquery.SinkPropertiesFactory; - import com.wepay.kafka.connect.bigquery.convert.BigQueryRecordConverter; import com.wepay.kafka.connect.bigquery.convert.BigQuerySchemaConverter; import org.apache.kafka.common.config.ConfigException; - import org.junit.Before; import org.junit.Test; -import java.util.HashMap; -import java.util.Map; +import java.util.*; + +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.CONNECTOR_RUNTIME_PROVIDER_CONFIG; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.CONNECTOR_RUNTIME_PROVIDER_DEFAULT; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; public class BigQuerySinkConfigTest { private SinkPropertiesFactory propertiesFactory; @@ -47,7 +48,11 @@ public void initializePropertiesFactory() { public void metaTestBasicConfigProperties() { Map basicConfigProperties = propertiesFactory.getProperties(); BigQuerySinkConfig config = new BigQuerySinkConfig(basicConfigProperties); - propertiesFactory.testProperties(config); + config.getList(BigQuerySinkConfig.TOPICS_CONFIG); + config.getString(BigQuerySinkConfig.PROJECT_CONFIG); + config.getKey(); + config.getBoolean(BigQuerySinkConfig.SANITIZE_TOPICS_CONFIG); + config.getInt(BigQuerySinkConfig.AVRO_DATA_CACHE_SIZE_CONFIG); } @Test @@ -81,4 +86,202 @@ public void testInvalidAvroCacheSize() { new BigQuerySinkConfig(badConfigProperties); } + + /** + * Test the default for the field name is not present. + */ + @Test + public void testEmptyTimestampPartitionFieldName() { + Map configProperties = propertiesFactory.getProperties(); + BigQuerySinkConfig testConfig = new BigQuerySinkConfig(configProperties); + assertFalse(testConfig.getTimestampPartitionFieldName().isPresent()); + } + + /** + * Test the field name being non-empty and the decorator set to false works correctly. + */ + @Test + public void testTimestampPartitionFieldName() { + Map configProperties = propertiesFactory.getProperties(); + configProperties.put(BigQuerySinkConfig.BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_CONFIG, "name"); + configProperties.put(BigQuerySinkConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG, "false"); + BigQuerySinkConfig testConfig = new BigQuerySinkConfig(configProperties); + assertTrue(testConfig.getTimestampPartitionFieldName().isPresent()); + assertFalse(testConfig.getBoolean(BigQuerySinkConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG)); + } + + /** + * Test the default for the field names is not present. + */ + @Test + public void testEmptyClusteringFieldNames() { + Map configProperties = propertiesFactory.getProperties(); + BigQuerySinkConfig testConfig = new BigQuerySinkConfig(configProperties); + assertFalse(testConfig.getClusteringPartitionFieldNames().isPresent()); + } + + /** + * Test if the field names are more than four fields errors correctly. 
+   */
+  @Test (expected = ConfigException.class)
+  public void testClusteringPartitionFieldNamesWithMoreThanFourFieldsError() {
+    Map<String, String> configProperties = propertiesFactory.getProperties();
+    configProperties.put(BigQuerySinkConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG, "true");
+    configProperties.put(
+        BigQuerySinkConfig.BIGQUERY_CLUSTERING_FIELD_NAMES_CONFIG,
+        "column1,column2,column3,column4,column5"
+    );
+    new BigQuerySinkConfig(configProperties);
+  }
+
+  /**
+   * Test that non-empty clustering field names together with an existing partitioning field work correctly.
+   */
+  @Test
+  public void testClusteringFieldNames() {
+    Map<String, String> configProperties = propertiesFactory.getProperties();
+    configProperties.put(BigQuerySinkConfig.BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_CONFIG, "name");
+    configProperties.put(BigQuerySinkConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG, "false");
+    configProperties.put(
+        BigQuerySinkConfig.BIGQUERY_CLUSTERING_FIELD_NAMES_CONFIG,
+        "column1,column2"
+    );
+
+    ArrayList<String> expectedClusteringPartitionFieldName = new ArrayList<>(
+        Arrays.asList("column1", "column2")
+    );
+
+    BigQuerySinkConfig testConfig = new BigQuerySinkConfig(configProperties);
+    Optional<List<String>> testClusteringPartitionFieldName = testConfig.getClusteringPartitionFieldNames();
+    assertTrue(testClusteringPartitionFieldName.isPresent());
+    assertEquals(expectedClusteringPartitionFieldName, testClusteringPartitionFieldName.get());
+  }
+
+  /**
+   * Test that the partition expiration is not present by default.
+   */
+  @Test
+  public void testEmptyPartitionExpirationMs() {
+    Map<String, String> configProperties = propertiesFactory.getProperties();
+    BigQuerySinkConfig testConfig = new BigQuerySinkConfig(configProperties);
+    assertFalse(testConfig.getPartitionExpirationMs().isPresent());
+  }
+
+  /**
+   * Test that the partition expiration is set correctly for a valid value.
+   */
+  @Test
+  public void testValidPartitionExpirationMs() {
+    Map<String, String> configProperties = propertiesFactory.getProperties();
+    configProperties.put(BigQuerySinkConfig.BIGQUERY_PARTITION_EXPIRATION_CONFIG, "1");
+    BigQuerySinkConfig testConfig = new BigQuerySinkConfig(configProperties);
+    assertTrue(testConfig.getPartitionExpirationMs().isPresent());
+    assertEquals(Optional.of(1L), testConfig.getPartitionExpirationMs());
+  }
+
+  /**
+   * Test that a non-positive partition expiration errors correctly.
+   */
+  @Test (expected = ConfigException.class)
+  public void testMinimumPartitionExpirationMs() {
+    Map<String, String> configProperties = propertiesFactory.getProperties();
+    configProperties.put(BigQuerySinkConfig.BIGQUERY_PARTITION_EXPIRATION_CONFIG, "0");
+    new BigQuerySinkConfig(configProperties);
+  }
+
+  @Test
+  public void testValidTimePartitioningTypes() {
+    Map<String, String> configProperties = propertiesFactory.getProperties();
+
+    for (TimePartitioning.Type type : TimePartitioning.Type.values()) {
+      configProperties.put(BigQuerySinkConfig.TIME_PARTITIONING_TYPE_CONFIG, type.name());
+      Optional<TimePartitioning.Type> timePartitioningType = new BigQuerySinkConfig(configProperties).getTimePartitioningType();
+      assertTrue(timePartitioningType.isPresent());
+      assertEquals(type, timePartitioningType.get());
+    }
+
+    configProperties.put(BigQuerySinkConfig.TIME_PARTITIONING_TYPE_CONFIG, BigQuerySinkConfig.TIME_PARTITIONING_TYPE_NONE);
+    Optional<TimePartitioning.Type> timePartitioningType = new BigQuerySinkConfig(configProperties).getTimePartitioningType();
+    assertEquals(Optional.empty(), timePartitioningType);
+  }
+
+  @Test(expected = ConfigException.class)
+  public void testInvalidTimePartitioningType() {
+    Map<String, String> configProperties = propertiesFactory.getProperties();
+
+    configProperties.put(BigQuerySinkConfig.TIME_PARTITIONING_TYPE_CONFIG, "fortnight");
+    new BigQuerySinkConfig(configProperties);
+  }
+
+  @Test
+  public void testKafkaProviderConfigDefaultValue() {
+    Map<String, String> configProperties = propertiesFactory.getProperties();
+    BigQuerySinkConfig config = new BigQuerySinkConfig(configProperties);
+    assertEquals(CONNECTOR_RUNTIME_PROVIDER_DEFAULT, config.getString(CONNECTOR_RUNTIME_PROVIDER_CONFIG));
+  }
+
+  @Test
+  public void testKafkaProviderConfig() {
+    Map<String, String> configProperties = propertiesFactory.getProperties();
+    String testKafkaProvider = "testProvider";
+    configProperties.put(CONNECTOR_RUNTIME_PROVIDER_CONFIG, testKafkaProvider);
+    BigQuerySinkConfig config = new BigQuerySinkConfig(configProperties);
+    assertEquals(testKafkaProvider, config.getString(CONNECTOR_RUNTIME_PROVIDER_CONFIG));
+  }
+
+  @Test (expected = ConfigException.class)
+  public void testTopic2TableInvalidFormat() {
+    Map<String, String> configProperties = propertiesFactory.getProperties();
+    configProperties.put(BigQuerySinkConfig.TOPIC2TABLE_MAP_CONFIG, "topic:");
+    new BigQuerySinkConfig(configProperties);
+  }
+
+  @Test (expected = ConfigException.class)
+  public void testTopic2TableDuplicateTopic() {
+    Map<String, String> configProperties = propertiesFactory.getProperties();
+    configProperties.put(BigQuerySinkConfig.TOPIC2TABLE_MAP_CONFIG, "topic:table, topic:table2");
+    new BigQuerySinkConfig(configProperties);
+  }
+
+  @Test (expected = ConfigException.class)
+  public void testTopic2TableDuplicateTable() {
+    Map<String, String> configProperties = propertiesFactory.getProperties();
+    configProperties.put(BigQuerySinkConfig.TOPIC2TABLE_MAP_CONFIG, "topic:table, topic2:table");
+    new BigQuerySinkConfig(configProperties);
+  }
+
+  @Test (expected = ConfigException.class)
+  public void testTopic2TableSemicolonOnly() {
+    Map<String, String> configProperties = propertiesFactory.getProperties();
+    configProperties.put(BigQuerySinkConfig.TOPIC2TABLE_MAP_CONFIG, ":");
+    new BigQuerySinkConfig(configProperties);
+  }
+
+  @Test
+  public void testValidTopic2TableMap() {
+    Map<String, String> configProperties = propertiesFactory.getProperties();
+    configProperties.put(BigQuerySinkConfig.TOPIC2TABLE_MAP_CONFIG, "topic:table, topic2:table2");
+    BigQuerySinkConfig config = new BigQuerySinkConfig(configProperties);
+    Map<String, String> topic2TableMap = new HashMap<>();
+    topic2TableMap.put("topic",
"table"); + topic2TableMap.put("topic2", "table2"); + assertEquals(topic2TableMap, config.getTopic2TableMap().get()); + } + + @Test + public void testTopic2TableEmptyString() { + Map configProperties = propertiesFactory.getProperties(); + configProperties.put(BigQuerySinkConfig.TOPIC2TABLE_MAP_CONFIG, ""); + BigQuerySinkConfig config = new BigQuerySinkConfig(configProperties); + assertFalse(config.getTopic2TableMap().isPresent()); + } + + @Test + public void testTopic2TableCommaOnly() { + Map configProperties = propertiesFactory.getProperties(); + configProperties.put(BigQuerySinkConfig.TOPIC2TABLE_MAP_CONFIG, ","); + BigQuerySinkConfig config = new BigQuerySinkConfig(configProperties); + assertFalse(config.getTopic2TableMap().isPresent()); + } + } diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/BigQuerySinkTaskConfigTest.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/BigQuerySinkTaskConfigTest.java deleted file mode 100644 index f751afec5..000000000 --- a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/BigQuerySinkTaskConfigTest.java +++ /dev/null @@ -1,175 +0,0 @@ -package com.wepay.kafka.connect.bigquery.config; - -/* - * Copyright 2016 WePay, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - -import com.wepay.kafka.connect.bigquery.SinkTaskPropertiesFactory; - -import org.apache.kafka.common.config.ConfigException; - -import org.junit.Before; -import org.junit.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.Optional; - -public class BigQuerySinkTaskConfigTest { - private SinkTaskPropertiesFactory propertiesFactory; - - @Before - public void initializePropertiesFactory() { - propertiesFactory = new SinkTaskPropertiesFactory(); - } - - @Test - public void metaTestBasicConfigProperties() { - Map basicConfigProperties = propertiesFactory.getProperties(); - BigQuerySinkTaskConfig config = new BigQuerySinkTaskConfig(basicConfigProperties); - propertiesFactory.testProperties(config); - } - - @Test() - public void testMaxWriteSize() { - // todo: something like this, maybe. - /* - Map badProperties = propertiesFactory.getProperties(); - badProperties.put(BigQuerySinkTaskConfig.MAX_WRITE_CONFIG, "-1"); - - try { - new BigQuerySinkTaskConfig(badProperties); - } catch (ConfigException err) { - fail("Exception encountered before addition of bad configuration field: " + err); - } - - badProperties.put(BigQuerySinkTaskConfig.MAX_WRITE_CONFIG, "0"); - new BigQuerySinkTaskConfig(badProperties); - */ - } - - /** - * Test the default for the field name is not present. 
- */ - @Test - public void testEmptyTimestampPartitionFieldName() { - Map configProperties = propertiesFactory.getProperties(); - BigQuerySinkTaskConfig testConfig = new BigQuerySinkTaskConfig(configProperties); - assertFalse(testConfig.getTimestampPartitionFieldName().isPresent()); - } - - /** - * Test if the field name being non-empty and the decorator default (true) errors correctly. - */ - @Test (expected = ConfigException.class) - public void testTimestampPartitionFieldNameError() { - Map configProperties = propertiesFactory.getProperties(); - configProperties.put(BigQuerySinkTaskConfig.BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_CONFIG, "name"); - new BigQuerySinkTaskConfig(configProperties); - } - - /** - * Test the field name being non-empty and the decorator set to false works correctly. - */ - @Test - public void testTimestampPartitionFieldName() { - Map configProperties = propertiesFactory.getProperties(); - configProperties.put(BigQuerySinkTaskConfig.BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_CONFIG, "name"); - configProperties.put(BigQuerySinkTaskConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG, "false"); - BigQuerySinkTaskConfig testConfig = new BigQuerySinkTaskConfig(configProperties); - assertTrue(testConfig.getTimestampPartitionFieldName().isPresent()); - assertFalse(testConfig.getBoolean(BigQuerySinkTaskConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG)); - } - - /** - * Test the default for the field names is not present. - */ - @Test - public void testEmptyClusteringFieldNames() { - Map configProperties = propertiesFactory.getProperties(); - BigQuerySinkTaskConfig testConfig = new BigQuerySinkTaskConfig(configProperties); - assertFalse(testConfig.getClusteringPartitionFieldName().isPresent()); - } - - /** - * Test if the field names being non-empty and the partitioning is not present errors correctly. - */ - @Test (expected = ConfigException.class) - public void testClusteringFieldNamesWithoutTimestampPartitionError() { - Map configProperties = propertiesFactory.getProperties(); - configProperties.put(BigQuerySinkTaskConfig.BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_CONFIG, null); - configProperties.put(BigQuerySinkTaskConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG, "false"); - configProperties.put( - BigQuerySinkTaskConfig.BIGQUERY_CLUSTERING_FIELD_NAMES_CONFIG, - "column1,column2" - ); - new BigQuerySinkTaskConfig(configProperties); - } - - /** - * Test if the field names are more than four fields errors correctly. - */ - @Test (expected = ConfigException.class) - public void testClusteringPartitionFieldNamesWithMoreThanFourFieldsError() { - Map configProperties = propertiesFactory.getProperties(); - configProperties.put(BigQuerySinkTaskConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG, "true"); - configProperties.put( - BigQuerySinkTaskConfig.BIGQUERY_CLUSTERING_FIELD_NAMES_CONFIG, - "column1,column2,column3,column4,column5" - ); - new BigQuerySinkTaskConfig(configProperties); - } - - /** - * Test the field names being non-empty and the partitioning field exists works correctly. 
- */ - @Test - public void testClusteringFieldNames() { - Map configProperties = propertiesFactory.getProperties(); - configProperties.put(BigQuerySinkTaskConfig.BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_CONFIG, "name"); - configProperties.put(BigQuerySinkTaskConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG, "false"); - configProperties.put( - BigQuerySinkTaskConfig.BIGQUERY_CLUSTERING_FIELD_NAMES_CONFIG, - "column1,column2" - ); - - ArrayList expectedClusteringPartitionFieldName = new ArrayList<>( - Arrays.asList("column1", "column2") - ); - - BigQuerySinkTaskConfig testConfig = new BigQuerySinkTaskConfig(configProperties); - Optional> testClusteringPartitionFieldName = testConfig.getClusteringPartitionFieldName(); - assertTrue(testClusteringPartitionFieldName.isPresent()); - assertEquals(expectedClusteringPartitionFieldName, testClusteringPartitionFieldName.get()); - } - - @Test(expected = ConfigException.class) - public void testSchemaUpdatesWithoutRetriever() { - Map badConfigProperties = propertiesFactory.getProperties(); - badConfigProperties.remove(BigQuerySinkTaskConfig.SCHEMA_RETRIEVER_CONFIG); - badConfigProperties.put(BigQuerySinkTaskConfig.ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_CONFIG, "true"); - badConfigProperties.put(BigQuerySinkTaskConfig.ALLOW_NEW_BIGQUERY_FIELDS_CONFIG, "true"); - - new BigQuerySinkTaskConfig(badConfigProperties); - } -} diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/CredentialsValidatorTest.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/CredentialsValidatorTest.java new file mode 100644 index 000000000..95ba68d54 --- /dev/null +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/CredentialsValidatorTest.java @@ -0,0 +1,82 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package com.wepay.kafka.connect.bigquery.config; + +import com.wepay.kafka.connect.bigquery.GcpClientBuilder; +import org.junit.Test; + +import java.util.Optional; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertTrue; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class CredentialsValidatorTest { + + @Test + public void testNoCredentialsSkipsValidation() { + BigQuerySinkConfig config = mock(BigQuerySinkConfig.class); + when(config.getKey()).thenReturn(null); + + assertEquals( + Optional.empty(), + new CredentialsValidator.BigQueryCredentialsValidator().doValidate(config) + ); + assertEquals( + Optional.empty(), + new CredentialsValidator.GcsCredentialsValidator().doValidate(config) + ); + } + + @Test + public void testFailureToConstructClient() { + BigQuerySinkConfig config = mock(BigQuerySinkConfig.class); + when(config.getKey()).thenReturn("key"); + + @SuppressWarnings("unchecked") + GcpClientBuilder mockClientBuilder = mock(GcpClientBuilder.class); + when(mockClientBuilder.withConfig(eq(config))).thenReturn(mockClientBuilder); + when(mockClientBuilder.build()).thenThrow(new RuntimeException("Provided credentials are invalid")); + + assertNotEquals( + Optional.empty(), + new CredentialsValidator.BigQueryCredentialsValidator().doValidate(config) + ); + assertNotEquals( + Optional.empty(), + new CredentialsValidator.GcsCredentialsValidator().doValidate(config) + ); + } + + @Test + public void testKeyShouldNotBeProvidedIfUsingApplicationDefaultCredentials() { + BigQuerySinkConfig config = mock(BigQuerySinkConfig.class); + when(config.getKey()).thenReturn("key"); + when(config.getKeySource()).thenReturn(GcpClientBuilder.KeySource.APPLICATION_DEFAULT); + + assertTrue( + new CredentialsValidator.BigQueryCredentialsValidator().doValidate(config) + .get().contains("should not be provided") + ); + } +} diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/GcsBucketValidatorTest.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/GcsBucketValidatorTest.java new file mode 100644 index 000000000..b8c80fee4 --- /dev/null +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/GcsBucketValidatorTest.java @@ -0,0 +1,122 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package com.wepay.kafka.connect.bigquery.config; + +import com.google.cloud.storage.Bucket; +import com.google.cloud.storage.Storage; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.Mock; +import org.mockito.junit.MockitoJUnitRunner; + +import java.util.Collections; +import java.util.Optional; + +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.ENABLE_BATCH_CONFIG; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.GCS_BUCKET_NAME_CONFIG; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +@RunWith(MockitoJUnitRunner.class) +public class GcsBucketValidatorTest { + + @Mock + private Storage gcs; + + @Test + public void testNullBatchLoadingSkipsValidation() { + BigQuerySinkConfig config = mock(BigQuerySinkConfig.class); + when(config.getList(ENABLE_BATCH_CONFIG)).thenReturn(null); + + assertEquals( + Optional.empty(), + new GcsBucketValidator().doValidate(gcs, config) + ); + } + + @Test + public void testEmptyBatchLoadingSkipsValidation() { + BigQuerySinkConfig config = mock(BigQuerySinkConfig.class); + when(config.getList(ENABLE_BATCH_CONFIG)).thenReturn(Collections.emptyList()); + + assertEquals( + Optional.empty(), + new GcsBucketValidator().doValidate(gcs, config) + ); + } + + @Test + public void testNullBucketWithBatchLoading() { + BigQuerySinkConfig config = mock(BigQuerySinkConfig.class); + when(config.getList(ENABLE_BATCH_CONFIG)).thenReturn(Collections.singletonList("t1")); + when(config.getString(GCS_BUCKET_NAME_CONFIG)).thenReturn(null); + + assertNotEquals( + Optional.empty(), + new GcsBucketValidator().doValidate(gcs, config) + ); + } + + @Test + public void testBlankBucketWithBatchLoading() { + BigQuerySinkConfig config = mock(BigQuerySinkConfig.class); + when(config.getList(ENABLE_BATCH_CONFIG)).thenReturn(Collections.singletonList("t1")); + when(config.getString(GCS_BUCKET_NAME_CONFIG)).thenReturn(" \t "); + + assertNotEquals( + Optional.empty(), + new GcsBucketValidator().doValidate(gcs, config) + ); + } + + @Test + public void testValidBucketWithBatchLoading() { + BigQuerySinkConfig config = mock(BigQuerySinkConfig.class); + final String bucketName = "gee_cs"; + when(config.getList(ENABLE_BATCH_CONFIG)).thenReturn(Collections.singletonList("t1")); + when(config.getString(GCS_BUCKET_NAME_CONFIG)).thenReturn(bucketName); + + Bucket bucket = mock(Bucket.class); + when(gcs.get(eq(bucketName))).thenReturn(bucket); + + assertEquals( + Optional.empty(), + new GcsBucketValidator().doValidate(gcs, config) + ); + } + + @Test + public void testMissingBucketAndBucketCreationDisabledWithBatchLoading() { + BigQuerySinkConfig config = mock(BigQuerySinkConfig.class); + final String bucketName = "gee_cs"; + when(config.getList(ENABLE_BATCH_CONFIG)).thenReturn(Collections.singletonList("t1")); + when(config.getString(GCS_BUCKET_NAME_CONFIG)).thenReturn(bucketName); + + when(gcs.get(eq(bucketName))).thenReturn(null); + + assertNotEquals( + Optional.empty(), + new GcsBucketValidator().doValidate(gcs, config) + ); + } +} diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/MultiPropertyValidatorTest.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/MultiPropertyValidatorTest.java new file mode 100644 index 000000000..205bb56a3 --- /dev/null +++ 
b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/MultiPropertyValidatorTest.java
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2020 Confluent, Inc.
+ *
+ * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package com.wepay.kafka.connect.bigquery.config;
+
+import com.google.common.collect.ImmutableMap;
+import org.apache.kafka.common.config.ConfigValue;
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.function.Function;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.fail;
+
+public class MultiPropertyValidatorTest {
+
+  private static class TestValidator<Config> extends MultiPropertyValidator<Config> {
+
+    private final List<String> dependents;
+    private final Function<Config, Optional<String>> validationFunction;
+
+    public TestValidator(String propertyName, List<String> dependents, Function<Config, Optional<String>> validationFunction) {
+      super(propertyName);
+      this.dependents = dependents;
+      this.validationFunction = validationFunction;
+    }
+
+    @Override
+    protected Collection<String> dependents() {
+      return dependents;
+    }
+
+    @Override
+    protected Optional<String> doValidate(Config config) {
+      return validationFunction.apply(config);
+    }
+  }
+
+  @Test
+  public void testExistingErrorSkipsValidation() {
+    MultiPropertyValidator<Object> validator = new TestValidator<>(
+        "p",
+        Arrays.asList("d1", "d2", "d3"),
+        o -> {
+          fail("Validation should not have been performed on property that already has an error");
+          return null;
+        }
+    );
+
+    ConfigValue configValue = new ConfigValue("p", "v", Collections.emptyList(), Collections.singletonList("an error"));
+
+    assertEquals(
+        Optional.empty(),
+        validator.validate(configValue, null, Collections.emptyMap())
+    );
+  }
+
+  @Test
+  public void testDependentErrorSkipsValidation() {
+    MultiPropertyValidator<Object> validator = new TestValidator<>(
+        "p",
+        Arrays.asList("d1", "d2", "d3"),
+        o -> {
+          fail("Validation should not have been performed on property whose dependent already has an error");
+          return null;
+        }
+    );
+
+    ConfigValue configValue = new ConfigValue("p", "v", Collections.emptyList(), Collections.emptyList());
+    Map<String, ConfigValue> valuesByName = ImmutableMap.of(
+        "d1", new ConfigValue("d1", "v1", Collections.emptyList(), Collections.emptyList()),
+        "d2", new ConfigValue("d2", "v1", Collections.emptyList(), Collections.singletonList("an error"))
+    );
+
+    assertEquals(
+        Optional.empty(),
+        validator.validate(configValue, null, valuesByName)
+    );
+  }
+
+  @Test
+  public void testValidationFails() {
+    Optional<String> expectedError = Optional.of("an error");
+    MultiPropertyValidator<Object> validator = new TestValidator<>(
+        "p",
+        Collections.emptyList(),
+        o -> expectedError
+    );
+
+    ConfigValue configValue = new ConfigValue("p", "v", Collections.emptyList(), Collections.emptyList());
+
+    assertEquals(
+        expectedError,
+
validator.validate(configValue, null, Collections.emptyMap()) + ); + } + + @Test + public void testUnexpectedErrorDuringValidation() { + MultiPropertyValidator validator = new TestValidator<>( + "p", + Collections.emptyList(), + o -> { + throw new RuntimeException("Some unexpected error"); + } + ); + + ConfigValue configValue = new ConfigValue("p", "v", Collections.emptyList(), Collections.emptyList()); + + assertNotEquals( + Optional.empty(), + validator.validate(configValue, null, Collections.emptyMap()) + ); + } +} diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/PartitioningModeValidatorTest.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/PartitioningModeValidatorTest.java new file mode 100644 index 000000000..a4b79a14c --- /dev/null +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/PartitioningModeValidatorTest.java @@ -0,0 +1,80 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package com.wepay.kafka.connect.bigquery.config; + +import org.junit.Test; + +import java.util.Optional; + +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class PartitioningModeValidatorTest { + + @Test + public void testDisabledDecoratorSyntaxSkipsValidation() { + BigQuerySinkConfig config = mock(BigQuerySinkConfig.class); + when(config.getBoolean(BIGQUERY_PARTITION_DECORATOR_CONFIG)).thenReturn(false); + + assertEquals( + Optional.empty(), + new PartitioningModeValidator().doValidate(config) + ); + } + + @Test + public void testDecoratorSyntaxWithoutTimestampPartitionFieldName() { + BigQuerySinkConfig config = mock(BigQuerySinkConfig.class); + when(config.getBoolean(BIGQUERY_PARTITION_DECORATOR_CONFIG)).thenReturn(true); + when(config.getTimestampPartitionFieldName()).thenReturn(Optional.empty()); + + assertEquals( + Optional.empty(), + new PartitioningModeValidator().doValidate(config) + ); + } + + @Test + public void testDecoratorSyntaxWithTimestampPartitionFieldName() { + BigQuerySinkConfig config = mock(BigQuerySinkConfig.class); + when(config.getBoolean(BIGQUERY_PARTITION_DECORATOR_CONFIG)).thenReturn(true); + when(config.getTimestampPartitionFieldName()).thenReturn(Optional.of("f1")); + + assertNotEquals( + Optional.empty(), + new PartitioningModeValidator().doValidate(config) + ); + } + + @Test + public void testTimestampPartitionFieldNameWithoutDecoratorSyntax() { + BigQuerySinkConfig config = mock(BigQuerySinkConfig.class); + when(config.getBoolean(BIGQUERY_PARTITION_DECORATOR_CONFIG)).thenReturn(false); + when(config.getTimestampPartitionFieldName()).thenReturn(Optional.of("f1")); + + assertEquals( + 
Optional.empty(), + new PartitioningModeValidator().doValidate(config) + ); + } +} diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/PartitioningTypeValidatorTest.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/PartitioningTypeValidatorTest.java new file mode 100644 index 000000000..07d0489cf --- /dev/null +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/PartitioningTypeValidatorTest.java @@ -0,0 +1,92 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package com.wepay.kafka.connect.bigquery.config; + +import com.google.cloud.bigquery.TimePartitioning; +import org.junit.Test; + +import java.util.Optional; + +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.TABLE_CREATE_CONFIG; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class PartitioningTypeValidatorTest { + + @Test + public void testDisabledDecoratorSyntaxSkipsValidation() { + BigQuerySinkConfig config = mock(BigQuerySinkConfig.class); + when(config.getBoolean(BIGQUERY_PARTITION_DECORATOR_CONFIG)).thenReturn(false); + when(config.getBoolean(TABLE_CREATE_CONFIG)).thenReturn(true); + + assertEquals( + Optional.empty(), + new PartitioningTypeValidator().doValidate(config) + ); + } + + @Test + public void testDisabledTableCreationSkipsValidation() { + BigQuerySinkConfig config = mock(BigQuerySinkConfig.class); + when(config.getBoolean(BIGQUERY_PARTITION_DECORATOR_CONFIG)).thenReturn(true); + when(config.getBoolean(TABLE_CREATE_CONFIG)).thenReturn(false); + + assertEquals( + Optional.empty(), + new PartitioningTypeValidator().doValidate(config) + ); + } + + @Test + public void testNonDayTimePartitioningWithTableCreationAndDecoratorSyntax() { + // TODO: This can be refactored into programmatically-generated test cases once we start using JUnit 5 + for (TimePartitioning.Type timePartitioningType : TimePartitioning.Type.values()) { + if (TimePartitioning.Type.DAY.equals(timePartitioningType)) { + continue; + } + + BigQuerySinkConfig config = mock(BigQuerySinkConfig.class); + when(config.getBoolean(BIGQUERY_PARTITION_DECORATOR_CONFIG)).thenReturn(true); + when(config.getBoolean(TABLE_CREATE_CONFIG)).thenReturn(true); + when(config.getTimePartitioningType()).thenReturn(Optional.of(timePartitioningType)); + + assertNotEquals( + Optional.empty(), + new PartitioningTypeValidator().doValidate(config) + ); + } + } + + @Test + public void testDayTimePartitioningWithTableCreationAndDecoratorSyntax() { + BigQuerySinkConfig config = mock(BigQuerySinkConfig.class); + 
when(config.getBoolean(BIGQUERY_PARTITION_DECORATOR_CONFIG)).thenReturn(true); + when(config.getBoolean(TABLE_CREATE_CONFIG)).thenReturn(true); + when(config.getTimePartitioningType()).thenReturn(Optional.of(TimePartitioning.Type.DAY)); + + assertEquals( + Optional.empty(), + new PartitioningTypeValidator().doValidate(config) + ); + } +} diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/convert/BigQueryRecordConverterTest.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/convert/BigQueryRecordConverterTest.java index 23bdd6a63..d11990500 100644 --- a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/convert/BigQueryRecordConverterTest.java +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/convert/BigQueryRecordConverterTest.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.convert; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.convert; import static org.junit.Assert.assertEquals; diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/convert/BigQuerySchemaConverterTest.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/convert/BigQuerySchemaConverterTest.java index 70c34e9a5..307ac34ea 100644 --- a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/convert/BigQuerySchemaConverterTest.java +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/convert/BigQuerySchemaConverterTest.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.convert; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,14 +17,17 @@ * under the License. 
*/ +package com.wepay.kafka.connect.bigquery.convert; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; import com.google.cloud.bigquery.Field; import com.google.cloud.bigquery.LegacySQLTypeName; import com.wepay.kafka.connect.bigquery.exception.ConversionConnectException; +import com.wepay.kafka.connect.bigquery.utils.FieldNameSanitizer; import org.apache.kafka.connect.data.Date; import org.apache.kafka.connect.data.Decimal; import org.apache.kafka.connect.data.Schema; @@ -33,6 +36,8 @@ import org.junit.Test; +import io.confluent.connect.avro.AvroData; + public class BigQuerySchemaConverterTest { @Test(expected = ConversionConnectException.class) @@ -417,6 +422,28 @@ public void testStringArray() { assertEquals(bigQueryExpectedSchema, bigQueryTestSchema); } + @Test + public void testFieldNameSanitized() { + final String fieldName = "String Array"; + com.google.cloud.bigquery.Schema bigQueryExpectedSchema = + com.google.cloud.bigquery.Schema.of( + com.google.cloud.bigquery.Field.newBuilder( + FieldNameSanitizer.sanitizeName(fieldName), + LegacySQLTypeName.STRING + ).setMode(com.google.cloud.bigquery.Field.Mode.REPEATED).build() + ); + + Schema kafkaConnectArraySchema = SchemaBuilder.array(Schema.STRING_SCHEMA).build(); + Schema kafkaConnectTestSchema = SchemaBuilder + .struct() + .field(fieldName, kafkaConnectArraySchema) + .build(); + + com.google.cloud.bigquery.Schema bigQueryTestSchema = + new BigQuerySchemaConverter(false, true).convertSchema(kafkaConnectTestSchema); + assertEquals(bigQueryExpectedSchema, bigQueryTestSchema); + } + @Test public void testBytes() { final String fieldName = "Bytes"; @@ -627,6 +654,46 @@ public void testAllFieldsNullable() { com.google.cloud.bigquery.Schema bigQueryTestSchema = new BigQuerySchemaConverter(true).convertSchema(kafkaConnectTestSchema); assertEquals(bigQueryExpectedSchema, bigQueryTestSchema); + } + @Test + public void testSimpleRecursiveSchemaThrows() { + final String fieldName = "RecursiveField"; + + // Construct Avro schema with recursion since we cannot directly construct Connect schema with cycle + org.apache.avro.Schema recursiveAvroSchema = org.apache.avro.SchemaBuilder + .record("RecursiveItem") + .namespace("com.example") + .fields() + .name(fieldName) + .type().unionOf().nullType().and().type("RecursiveItem").endUnion() + .nullDefault() + .endRecord(); + + Schema connectSchema = new AvroData(100).toConnectSchema(recursiveAvroSchema); + ConversionConnectException e = assertThrows(ConversionConnectException.class, () -> + new BigQuerySchemaConverter(true).convertSchema(connectSchema)); + assertEquals("Kafka Connect schema contains cycle", e.getMessage()); + } + + @Test + public void testComplexRecursiveSchemaThrows() { + final String fieldName = "RecursiveField"; + + // Construct Avro schema with recursion since we cannot directly construct Connect schema with cycle + org.apache.avro.Schema recursiveAvroSchema = org.apache.avro.SchemaBuilder + .record("RecursiveItem") + .namespace("com.example") + .fields() + .name(fieldName) + .type() + .array().items() + .map().values().type("RecursiveItem").noDefault() + .endRecord(); + + Schema connectSchema = new AvroData(100).toConnectSchema(recursiveAvroSchema); + ConversionConnectException e = assertThrows(ConversionConnectException.class, () -> + new BigQuerySchemaConverter(true).convertSchema(connectSchema)); + assertEquals("Kafka Connect schema contains cycle", e.getMessage()); } } diff --git 
a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/convert/KafkaDataConverterTest.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/convert/KafkaDataConverterTest.java index b2eefb22d..5833c7070 100644 --- a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/convert/KafkaDataConverterTest.java +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/convert/KafkaDataConverterTest.java @@ -1,3 +1,22 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package com.wepay.kafka.connect.bigquery.convert; diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/DebeziumLogicalConvertersTest.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/DebeziumLogicalConvertersTest.java index 91a5c0080..059fccbe4 100644 --- a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/DebeziumLogicalConvertersTest.java +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/DebeziumLogicalConvertersTest.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.convert.logicaltype; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. 
*/ +package com.wepay.kafka.connect.bigquery.convert.logicaltype; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; @@ -39,6 +40,7 @@ public class DebeziumLogicalConvertersTest { //corresponds to March 1 2017, 22:20:38.808(123) UTC // (March 1 2017, 14:20:38.808(123)-8:00) private static final Integer DAYS_TIMESTAMP = 17226; + private static final Integer MILLI_TIMESTAMP_INT = 1488406838; private static final Long MILLI_TIMESTAMP = 1488406838808L; private static final Long MICRO_TIMESTAMP = 1488406838808123L; @@ -102,8 +104,8 @@ public void testTimeConversion() { fail("Expected encoding type check to succeed."); } - String formattedTime = converter.convert(MILLI_TIMESTAMP); - assertEquals("22:20:38.808", formattedTime); + String formattedTime = converter.convert(MILLI_TIMESTAMP_INT); + assertEquals("05:26:46.838", formattedTime); } @Test diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/KafkaLogicalConvertersTest.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/KafkaLogicalConvertersTest.java index c40a73221..5eb72902f 100644 --- a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/KafkaLogicalConvertersTest.java +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/convert/logicaltype/KafkaLogicalConvertersTest.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.convert.logicaltype; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. 
*/ +package com.wepay.kafka.connect.bigquery.convert.logicaltype; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; @@ -26,6 +27,7 @@ import com.wepay.kafka.connect.bigquery.convert.logicaltype.KafkaLogicalConverters.DateConverter; import com.wepay.kafka.connect.bigquery.convert.logicaltype.KafkaLogicalConverters.DecimalConverter; import com.wepay.kafka.connect.bigquery.convert.logicaltype.KafkaLogicalConverters.TimestampConverter; +import com.wepay.kafka.connect.bigquery.convert.logicaltype.KafkaLogicalConverters.TimeConverter; import org.apache.kafka.connect.data.Schema; @@ -100,4 +102,32 @@ public void testTimestampConversion() { assertEquals("2017-03-01 22:20:38.808", formattedTimestamp); } + + + @Test + public void testTimeConversion() { + TimeConverter converter = new KafkaLogicalConverters.TimeConverter(); + + assertEquals(LegacySQLTypeName.TIME, converter.getBQSchemaType()); + + try { + converter.checkEncodingType(Schema.Type.INT32); + } catch (Exception ex) { + fail("Expected encoding type check to succeed."); + } + + try { + converter.checkEncodingType(Schema.Type.INT64); + fail("Expected encoding type check to fail"); + } catch (Exception ex) { + // continue + } + + // Can't use the same timestamp here as the one in other tests as the Time type + // should only fall on January 1st, 1970 + Date date = new Date(166838808); + String formattedTimestamp = converter.convert(date); + + assertEquals("22:20:38.808", formattedTimestamp); + } } diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/BaseConnectorIT.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/BaseConnectorIT.java new file mode 100644 index 000000000..e27b6e5c3 --- /dev/null +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/BaseConnectorIT.java @@ -0,0 +1,387 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package com.wepay.kafka.connect.bigquery.integration; + +import java.time.LocalDate; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + +import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.bigquery.Field; +import com.google.cloud.bigquery.FieldValue; +import com.google.cloud.bigquery.QueryJobConfiguration; +import com.google.cloud.bigquery.Schema; +import com.google.cloud.bigquery.Table; +import com.google.cloud.bigquery.TableResult; +import com.wepay.kafka.connect.bigquery.GcpClientBuilder; +import com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig; +import com.wepay.kafka.connect.bigquery.utils.FieldNameSanitizer; +import org.apache.kafka.clients.admin.Admin; +import org.apache.kafka.clients.consumer.OffsetAndMetadata; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.utils.Utils; +import org.apache.kafka.connect.runtime.AbstractStatus; +import org.apache.kafka.connect.runtime.WorkerConfig; +import org.apache.kafka.connect.runtime.rest.entities.ConnectorStateInfo; +import org.apache.kafka.connect.util.clusters.EmbeddedConnectCluster; +import org.apache.kafka.test.IntegrationTest; +import org.apache.kafka.test.NoRetryException; +import org.apache.kafka.test.TestUtils; +import org.junit.experimental.categories.Category; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import static com.google.cloud.bigquery.LegacySQLTypeName.BOOLEAN; +import static com.google.cloud.bigquery.LegacySQLTypeName.BYTES; +import static com.google.cloud.bigquery.LegacySQLTypeName.DATE; +import static com.google.cloud.bigquery.LegacySQLTypeName.FLOAT; +import static com.google.cloud.bigquery.LegacySQLTypeName.INTEGER; +import static com.google.cloud.bigquery.LegacySQLTypeName.STRING; +import static com.google.cloud.bigquery.LegacySQLTypeName.TIMESTAMP; +import static org.apache.kafka.connect.runtime.ConnectorConfig.CONNECTOR_CLASS_CONFIG; +import static org.apache.kafka.connect.runtime.ConnectorConfig.TASKS_MAX_CONFIG; +import static org.apache.kafka.test.TestUtils.waitForCondition; +import static org.junit.Assert.assertTrue; + +@Category(IntegrationTest.class) +public abstract class BaseConnectorIT { + private static final Logger logger = LoggerFactory.getLogger(BaseConnectorIT.class); + + private static final String KEYFILE_ENV_VAR = "KCBQ_TEST_KEYFILE"; + private static final String PROJECT_ENV_VAR = "KCBQ_TEST_PROJECT"; + private static final String DATASET_ENV_VAR = "KCBQ_TEST_DATASET"; + private static final String GCS_BUCKET_ENV_VAR = "KCBQ_TEST_BUCKET"; + private static final String GCS_FOLDER_ENV_VAR = "KCBQ_TEST_FOLDER"; + private static final String TEST_NAMESPACE_ENV_VAR = "KCBQ_TEST_TABLE_SUFFIX"; + + protected static final long OFFSET_COMMIT_INTERVAL_MS = TimeUnit.SECONDS.toMillis(10); + protected static final long COMMIT_MAX_DURATION_MS = TimeUnit.MINUTES.toMillis(5); + protected static final long OFFSETS_READ_TIMEOUT_MS = TimeUnit.SECONDS.toMillis(10); + protected static final long CONNECTOR_STARTUP_DURATION_MS = TimeUnit.SECONDS.toMillis(60); + + protected 
EmbeddedConnectCluster connect; + private Admin kafkaAdminClient; + + protected void startConnect() { + Map<String, String> workerProps = new HashMap<>(); + workerProps.put( + WorkerConfig.OFFSET_COMMIT_INTERVAL_MS_CONFIG, Long.toString(OFFSET_COMMIT_INTERVAL_MS)); + // Allow per-connector consumer configuration for throughput testing + workerProps.put( + WorkerConfig.CONNECTOR_CLIENT_POLICY_CLASS_CONFIG, "All"); + + connect = new EmbeddedConnectCluster.Builder() + .name("kcbq-connect-cluster") + .workerProps(workerProps) + .build(); + + // start the clusters + connect.start(); + + kafkaAdminClient = connect.kafka().createAdminClient(); + + // the exception handler installed by the embedded zookeeper instance is noisy and unnecessary + Thread.setDefaultUncaughtExceptionHandler((t, e) -> { }); + } + + protected void stopConnect() { + if (kafkaAdminClient != null) { + Utils.closeQuietly(kafkaAdminClient, "admin client for embedded Kafka cluster"); + kafkaAdminClient = null; + } + + // stop all Connect, Kafka and Zk threads. + if (connect != null) { + Utils.closeQuietly(connect::stop, "embedded Connect, Kafka, and Zookeeper clusters"); + connect = null; + } + } + + protected Map<String, String> baseConnectorProps(int tasksMax) { + Map<String, String> result = new HashMap<>(); + + result.put(CONNECTOR_CLASS_CONFIG, "BigQuerySinkConnector"); + result.put(TASKS_MAX_CONFIG, Integer.toString(tasksMax)); + + result.put(BigQuerySinkConfig.PROJECT_CONFIG, project()); + result.put(BigQuerySinkConfig.DEFAULT_DATASET_CONFIG, dataset()); + result.put(BigQuerySinkConfig.KEYFILE_CONFIG, keyFile()); + result.put(BigQuerySinkConfig.KEY_SOURCE_CONFIG, keySource()); + + result.put(BigQuerySinkConfig.SANITIZE_TOPICS_CONFIG, "true"); + + return result; + } + + protected BigQuery newBigQuery() { + return new GcpClientBuilder.BigQueryBuilder() + .withKey(keyFile()) + .withKeySource(GcpClientBuilder.KeySource.valueOf(keySource())) + .withProject(project()) + .withUserAgent("ITTest-user-agent") + .build(); + } + + protected void waitForCommittedRecords( + String connector, String topic, long numRecords, int numTasks + ) throws InterruptedException { + waitForCommittedRecords(connector, Collections.singleton(topic), numRecords, numTasks, COMMIT_MAX_DURATION_MS); + } + + protected void waitForCommittedRecords( + String connector, Collection<String> topics, long numRecords, int numTasks, long timeoutMs + ) throws InterruptedException { + waitForCondition( + () -> { + long totalCommittedRecords = totalCommittedRecords(connector, topics); + if (totalCommittedRecords >= numRecords) { + return true; + } else { + // Check to make sure the connector is still running.
If not, fail fast + try { + assertTrue( + "Connector or one of its tasks failed during testing", + assertConnectorAndTasksRunning(connector, numTasks).orElse(false)); + } catch (AssertionError e) { + throw new NoRetryException(e); + } + logger.debug("Connector has only committed {} records for topics {} so far; {} expected", + totalCommittedRecords, topics, numRecords); + // Sleep here so as not to spam Kafka with list-offsets requests + Thread.sleep(OFFSET_COMMIT_INTERVAL_MS / 2); + return false; + } + }, + timeoutMs, + "Either the connector failed, or the message commit duration expired without all expected messages committed"); + } + + protected synchronized long totalCommittedRecords(String connector, Collection<String> topics) throws TimeoutException, ExecutionException, InterruptedException { + // See https://github.com/apache/kafka/blob/f7c38d83c727310f4b0678886ba410ae2fae9379/connect/runtime/src/main/java/org/apache/kafka/connect/util/SinkUtils.java + // for how the consumer group ID is constructed for sink connectors + Map<TopicPartition, OffsetAndMetadata> offsets = kafkaAdminClient + .listConsumerGroupOffsets("connect-" + connector) + .partitionsToOffsetAndMetadata() + .get(OFFSETS_READ_TIMEOUT_MS, TimeUnit.MILLISECONDS); + + logger.trace("Connector {} has so far committed offsets {}", connector, offsets); + + return offsets.entrySet().stream() + .filter(entry -> topics.contains(entry.getKey().topic())) + .mapToLong(entry -> entry.getValue().offset()) + .sum(); + } + + /** + * Read all rows from the given table. + * @param bigQuery used to connect to BigQuery + * @param tableName the table to read + * @param sortColumn a column to sort rows by (can use dot notation to refer to nested fields) + * @return a list of all rows from the table, ordered by the given sort column. + */ + protected List<List<Object>> readAllRows( + BigQuery bigQuery, String tableName, String sortColumn) throws InterruptedException { + + Table table = bigQuery.getTable(dataset(), tableName); + Schema schema = table + .getDefinition() + .getSchema(); + + TableResult tableResult = bigQuery.query(QueryJobConfiguration.of(String.format( + "SELECT * FROM `%s`.`%s` ORDER BY %s ASC", + dataset(), + tableName, + sortColumn + ))); + + return StreamSupport.stream(tableResult.iterateAll().spliterator(), false) + .map(fieldValues -> convertRow(schema.getFields(), fieldValues)) + .collect(Collectors.toList()); + } + + protected static List<Byte> boxByteArray(byte[] bytes) { + Byte[] result = new Byte[bytes.length]; + for (int i = 0; i < bytes.length; i++) { + result[i] = bytes[i]; + } + return Arrays.asList(result); + } + + private Object convertField(Field fieldSchema, FieldValue field) { + if (field.isNull()) { + return null; + } + switch (field.getAttribute()) { + case PRIMITIVE: + if (fieldSchema.getType().equals(BOOLEAN)) { + return field.getBooleanValue(); + } else if (fieldSchema.getType().equals(BYTES)) { + // Do this in order for assertEquals() to work when this is an element of two compared + // lists + return boxByteArray(field.getBytesValue()); + } else if (fieldSchema.getType().equals(DATE)) { + DateTimeFormatter dateFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd"); + return LocalDate.parse(field.getStringValue(), dateFormatter) + .atStartOfDay(ZoneOffset.UTC) + .toInstant() + .toEpochMilli(); + } else if (fieldSchema.getType().equals(FLOAT)) { + return field.getDoubleValue(); + } else if (fieldSchema.getType().equals(INTEGER)) { + return field.getLongValue(); + } else if (fieldSchema.getType().equals(STRING)) { + return field.getStringValue(); + } else if
(fieldSchema.getType().equals(TIMESTAMP)) { + return field.getTimestampValue(); + } else { + throw new RuntimeException("Cannot convert primitive field type " + + fieldSchema.getType()); + } + case REPEATED: + List<Object> result = new ArrayList<>(); + for (FieldValue arrayField : field.getRepeatedValue()) { + result.add(convertField(fieldSchema, arrayField)); + } + return result; + case RECORD: + List<Field> recordSchemas = fieldSchema.getSubFields(); + List<FieldValue> recordFields = field.getRecordValue(); + return convertRow(recordSchemas, recordFields); + default: + throw new RuntimeException("Unknown field attribute: " + field.getAttribute()); + } + } + + private List<Object> convertRow(List<Field> rowSchema, List<FieldValue> row) { + List<Object> result = new ArrayList<>(); + assert (rowSchema.size() == row.size()); + + for (int i = 0; i < rowSchema.size(); i++) { + result.add(convertField(rowSchema.get(i), row.get(i))); + } + + return result; + } + + /** + * Wait up to {@link #CONNECTOR_STARTUP_DURATION_MS maximum time limit} for the connector with the given + * name to start the specified number of tasks. + * + * @param name the name of the connector + * @param numTasks the minimum number of tasks that are expected + * @throws InterruptedException if this was interrupted + */ + protected void waitForConnectorToStart(String name, int numTasks) throws InterruptedException { + waitForCondition( + () -> assertConnectorAndTasksRunning(name, numTasks).orElse(false), + CONNECTOR_STARTUP_DURATION_MS, + "Connector tasks did not start in time." + ); + } + + /** + * Confirm that a connector is running with at least the given number of tasks. + * + * @param connectorName the connector + * @param numTasks the minimum number of tasks + * @return true if the connector and tasks are in RUNNING state; false otherwise + */ + protected Optional<Boolean> assertConnectorAndTasksRunning(String connectorName, int numTasks) { + try { + ConnectorStateInfo info = connect.connectorStatus(connectorName); + boolean result = info != null + && info.tasks().size() >= numTasks + && info.connector().state().equals(AbstractStatus.State.RUNNING.toString()) + && info.tasks().stream().allMatch(s -> s.state().equals(AbstractStatus.State.RUNNING.toString())); + return Optional.of(result); + } catch (Exception e) { + logger.debug("Could not check connector state info.", e); + return Optional.empty(); + } + } + + protected String suffixedTableOrTopic(String tableOrTopic) { + return tableOrTopic + tableSuffix(); + } + + protected String sanitizedTable(String table) { + return FieldNameSanitizer.sanitizeName(table); + } + + protected String suffixedAndSanitizedTable(String table) { + return sanitizedTable(suffixedTableOrTopic(table)); + } + + private String readEnvVar(String var) { + String result = System.getenv(var); + if (result == null) { + throw new IllegalStateException(String.format( + "Environment variable '%s' must be supplied to run integration tests", + var)); + } + return result; + } + + private String readEnvVar(String var, String defaultVal) { + return System.getenv().getOrDefault(var, defaultVal); + } + + protected String keyFile() { + return readEnvVar(KEYFILE_ENV_VAR); + } + + protected String project() { + return readEnvVar(PROJECT_ENV_VAR); + } + + protected String dataset() { + return readEnvVar(DATASET_ENV_VAR); + } + + protected String keySource() { + return BigQuerySinkConfig.KEY_SOURCE_DEFAULT; + } + + protected String gcsBucket() { + return readEnvVar(GCS_BUCKET_ENV_VAR); + } + +
protected String gcsFolder() { + return readEnvVar(GCS_FOLDER_ENV_VAR, BigQuerySinkConfig.GCS_FOLDER_NAME_DEFAULT); + } + + protected String tableSuffix() { + return readEnvVar(TEST_NAMESPACE_ENV_VAR, ""); + } +} diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/BigQueryErrorResponsesIT.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/BigQueryErrorResponsesIT.java new file mode 100644 index 000000000..17ccd89f4 --- /dev/null +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/BigQueryErrorResponsesIT.java @@ -0,0 +1,283 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package com.wepay.kafka.connect.bigquery.integration; + +import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.bigquery.BigQueryError; +import com.google.cloud.bigquery.BigQueryException; +import com.google.cloud.bigquery.Field; +import com.google.cloud.bigquery.InsertAllRequest; +import com.google.cloud.bigquery.InsertAllResponse; +import com.google.cloud.bigquery.LegacySQLTypeName; +import com.google.cloud.bigquery.Schema; +import com.google.cloud.bigquery.StandardSQLTypeName; +import com.google.cloud.bigquery.StandardTableDefinition; +import com.google.cloud.bigquery.Table; +import com.google.cloud.bigquery.TableId; +import com.google.cloud.bigquery.TableInfo; +import com.wepay.kafka.connect.bigquery.integration.utils.TableClearer; +import com.wepay.kafka.connect.bigquery.write.row.BigQueryErrorResponses; +import org.apache.kafka.test.TestUtils; +import org.junit.Before; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.LongStream; + +import static com.google.cloud.bigquery.InsertAllRequest.RowToInsert; +import static com.wepay.kafka.connect.bigquery.utils.TableNameUtils.table; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +public class BigQueryErrorResponsesIT extends BaseConnectorIT { + + private static final Logger logger = LoggerFactory.getLogger(BigQueryErrorResponsesIT.class); + + private BigQuery bigQuery; + + @Before + public void setup() { + bigQuery = newBigQuery(); + } + + @Test + public void testWriteToNonExistentTable() { + TableId table = TableId.of(dataset(), suffixedAndSanitizedTable("nonexistent table")); + TableClearer.clearTables(bigQuery, dataset(), table.getTable()); + + try { + bigQuery.insertAll(InsertAllRequest.of(table, RowToInsert.of(Collections.singletonMap("f1", "v1")))); + fail("Should have failed to write to nonexistent table"); + } catch (BigQueryException e) { + 
logger.debug("Nonexistent table write error", e); + assertTrue(BigQueryErrorResponses.isNonExistentTableError(e)); + } + } + + @Test + public void testWriteToRecreatedTable() throws Exception { + TableId table = TableId.of(dataset(), suffixedAndSanitizedTable("recreated table")); + TableClearer.clearTables(bigQuery, dataset(), table.getTable()); + + Schema schema = Schema.of(Field.of("f1", LegacySQLTypeName.STRING)); + + // Create the table... + bigQuery.create(TableInfo.newBuilder(table, StandardTableDefinition.of(schema)).build()); + + // Make sure that it exists... + TestUtils.waitForCondition( + () -> bigQuery.getTable(table) != null, + 60_000L, + "Table does not appear to exist one minute after issuing create request" + ); + logger.info("Created {} successfully", table(table)); + + // Delete it... + bigQuery.delete(table); + + // Make sure that it's deleted + TestUtils.waitForCondition( + () -> bigQuery.getTable(table) == null, + 60_000L, + "Table still appears to exist one minute after issuing delete request" + ); + logger.info("Deleted {} successfully", table(table)); + + TestUtils.waitForCondition( + () -> { + // Try to write to it... + try { + bigQuery.insertAll(InsertAllRequest.of(table, RowToInsert.of(Collections.singletonMap("f1", "v1")))); + return false; + } catch (BigQueryException e) { + logger.debug("Deleted table write error", e); + return BigQueryErrorResponses.isNonExistentTableError(e); + } + }, + 60_000L, + "Never failed to write to just-deleted table" + ); + + // Recreate it... + bigQuery.create(TableInfo.newBuilder(table, StandardTableDefinition.of(schema)).build()); + + TestUtils.waitForCondition( + () -> { + // Try to write to it... + try { + bigQuery.insertAll(InsertAllRequest.of(table, RowToInsert.of(Collections.singletonMap("f1", "v1")))); + return true; + } catch (BigQueryException e) { + logger.debug("Recreated table write error", e); + return false; + } + }, + 60_000L, + "Never succeeded to write to just-recreated table" + ); + } + + @Test + public void testWriteToTableWithoutSchema() { + TableId table = TableId.of(dataset(), suffixedAndSanitizedTable("missing schema")); + createOrAssertSchemaMatches(table, Schema.of()); + + try { + bigQuery.insertAll(InsertAllRequest.of(table, RowToInsert.of(Collections.singletonMap("f1", "v1")))); + fail("Should have failed to write to table with no schema"); + } catch (BigQueryException e) { + logger.debug("Table missing schema write error", e); + assertTrue(BigQueryErrorResponses.isTableMissingSchemaError(e)); + } + } + + @Test + public void testWriteWithMissingRequiredFields() { + TableId table = TableId.of(dataset(), suffixedAndSanitizedTable("too many fields")); + Schema schema = Schema.of( + Field.newBuilder("f1", StandardSQLTypeName.STRING).setMode(Field.Mode.REQUIRED).build(), + Field.newBuilder("f2", StandardSQLTypeName.INT64).setMode(Field.Mode.REQUIRED).build(), + Field.newBuilder("f3", StandardSQLTypeName.BOOL).setMode(Field.Mode.NULLABLE).build() + ); + createOrAssertSchemaMatches(table, schema); + + InsertAllResponse response = bigQuery.insertAll(InsertAllRequest.of(table, RowToInsert.of(Collections.singletonMap("f2", 12L)))); + logger.debug("Write response errors for missing required field: {}", response.getInsertErrors()); + BigQueryError error = assertResponseHasSingleError(response); + assertTrue(BigQueryErrorResponses.isMissingRequiredFieldError(error)); + } + + @Test + public void testWriteWithUnrecognizedFields() { + TableId table = TableId.of(dataset(), suffixedAndSanitizedTable("not enough fields")); 
+ Schema schema = Schema.of( + Field.newBuilder("f1", StandardSQLTypeName.STRING).setMode(Field.Mode.REQUIRED).build() + ); + createOrAssertSchemaMatches(table, schema); + + Map<String, Object> row = new HashMap<>(); + row.put("f1", "v1"); + row.put("f2", 12L); + InsertAllResponse response = bigQuery.insertAll(InsertAllRequest.of(table, RowToInsert.of(row))); + logger.debug("Write response errors for unrecognized field: {}", response.getInsertErrors()); + BigQueryError error = assertResponseHasSingleError(response); + assertTrue(BigQueryErrorResponses.isUnrecognizedFieldError(error)); + } + + @Test + public void testStoppedRowsDuringInvalidWrite() { + TableId table = TableId.of(dataset(), suffixedAndSanitizedTable("not enough fields")); + Schema schema = Schema.of( + Field.newBuilder("f1", StandardSQLTypeName.STRING).setMode(Field.Mode.REQUIRED).build() + ); + createOrAssertSchemaMatches(table, schema); + + Map<String, Object> row1 = new HashMap<>(); + row1.put("f1", "v1"); + row1.put("f2", 12L); + Map<String, Object> row2 = Collections.singletonMap("f1", "v2"); + InsertAllResponse response = bigQuery.insertAll(InsertAllRequest.of(table, RowToInsert.of(row1), RowToInsert.of(row2))); + logger.debug("Write response errors for unrecognized field and stopped row: {}", response.getInsertErrors()); + assertEquals(2, response.getInsertErrors().size()); + // As long as we have some kind of error on the first row it's fine; we want to be more precise in our assertions about the second row + assertListHasSingleElement(response.getErrorsFor(0)); + BigQueryError secondRowError = assertListHasSingleElement(response.getErrorsFor(1)); + assertTrue(BigQueryErrorResponses.isStoppedError(secondRowError)); + } + + @Test + public void testRequestPayloadTooLarge() { + TableId table = TableId.of(dataset(), suffixedAndSanitizedTable("request payload too large")); + Schema schema = Schema.of( + Field.newBuilder("f1", StandardSQLTypeName.STRING).setMode(Field.Mode.REQUIRED).build() + ); + createOrAssertSchemaMatches(table, schema); + + char[] chars = new char[10 * 1024 * 1024]; + Arrays.fill(chars, '*'); + String columnValue = new String(chars); + try { + bigQuery.insertAll(InsertAllRequest.of(table, RowToInsert.of(Collections.singletonMap("f1", columnValue)))); + fail("Should have failed to write to table with 11MB request"); + } catch (BigQueryException e) { + logger.debug("Large request payload write error", e); + assertTrue(BigQueryErrorResponses.isRequestTooLargeError(e)); + } + } + + @Test + public void testTooManyRows() { + TableId table = TableId.of(dataset(), suffixedAndSanitizedTable("too many rows")); + Schema schema = Schema.of( + Field.newBuilder("f1", StandardSQLTypeName.INT64).setMode(Field.Mode.REQUIRED).build() + ); + createOrAssertSchemaMatches(table, schema); + + Iterable<RowToInsert> rows = LongStream.range(0, 100_000) + .mapToObj(i -> Collections.singletonMap("f1", i)) + .map(RowToInsert::of) + .collect(Collectors.toList()); + try { + bigQuery.insertAll(InsertAllRequest.of(table, rows)); + fail("Should have failed to write to table with 100,000 rows"); + } catch (BigQueryException e) { + logger.debug("Too many rows write error", e); + assertTrue(BigQueryErrorResponses.isTooManyRowsError(e)); + } + } + + // Some tables can't be deleted, recreated, and written to without getting a temporary error from BigQuery, + // so we just create them once if they don't exist and don't delete them at the end of the test. + // If we detect a table left over (presumably from a prior test), we do a sanity check to make sure that it + // has the expected schema.
+ private void createOrAssertSchemaMatches(TableId tableId, Schema schema) { + Table table = bigQuery.getTable(tableId); + if (table == null) { + bigQuery.create(TableInfo.newBuilder(tableId, StandardTableDefinition.of(schema)).build()); + } else { + assertEquals( + String.format("Testing %s should be created automatically by tests; please delete the table and re-run this test", table(tableId)), + schema, + table.getDefinition().getSchema() + ); + } + } + + private BigQueryError assertResponseHasSingleError(InsertAllResponse response) { + assertEquals(1, response.getInsertErrors().size()); + Iterator<List<BigQueryError>> errorsIterator = response.getInsertErrors().values().iterator(); + assertTrue(errorsIterator.hasNext()); + return assertListHasSingleElement(errorsIterator.next()); + } + + private <T> T assertListHasSingleElement(List<T> list) { + assertEquals(1, list.size()); + return list.get(0); + } +} diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/BigQuerySinkConnectorIT.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/BigQuerySinkConnectorIT.java new file mode 100644 index 000000000..16866e858 --- /dev/null +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/BigQuerySinkConnectorIT.java @@ -0,0 +1,334 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ + +package com.wepay.kafka.connect.bigquery.integration; + +import com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig; +import com.wepay.kafka.connect.bigquery.integration.utils.BucketClearer; +import com.wepay.kafka.connect.bigquery.integration.utils.TableClearer; +import com.wepay.kafka.connect.bigquery.retrieve.IdentitySchemaRetriever; +import com.wepay.kafka.connect.bigquery.utils.FieldNameSanitizer; +import io.confluent.connect.avro.AvroConverter; +import io.confluent.kafka.formatter.AvroMessageReader; +import io.confluent.kafka.schemaregistry.ClusterTestHarness; +import io.confluent.kafka.schemaregistry.CompatibilityLevel; +import io.confluent.kafka.schemaregistry.RestApp; +import io.confluent.kafka.schemaregistry.rest.SchemaRegistryConfig; +import kafka.common.MessageReader; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.Producer; +import org.apache.kafka.clients.producer.ProducerConfig; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.common.serialization.Serdes; +import org.apache.kafka.common.utils.Utils; +import org.apache.kafka.connect.runtime.ConnectorConfig; +import org.apache.kafka.connect.runtime.SinkConnectorConfig; +import org.apache.kafka.test.IntegrationTest; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Scanner; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import static com.wepay.kafka.connect.bigquery.integration.BaseConnectorIT.boxByteArray; +import static io.confluent.kafka.serializers.AbstractKafkaSchemaSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG; +import static org.junit.Assert.assertEquals; + +@Category(IntegrationTest.class) +@RunWith(Parameterized.class) +public class BigQuerySinkConnectorIT { + + @Parameterized.Parameters + public static Iterable<Object[]> testCases() { + Collection<Object[]> result = new ArrayList<>(); + + List<List<Object>> expectedGcsLoadRows = new ArrayList<>(); + expectedGcsLoadRows.add(Arrays.asList( + 1L, + null, + false, + 4242L, + 42424242424242L, + 42.42, + 42424242.42424242, + "forty-two", + boxByteArray(new byte[] { 0x0, 0xf, 0x1E, 0x2D, 0x3C, 0x4B, 0x5A, 0x69, 0x78 }) + )); + expectedGcsLoadRows.add(Arrays.asList( + 2L, + 5L, + true, + 4354L, + 435443544354L, + 43.54, + 435443.544354, + "forty-three", + boxByteArray(new byte[] { 0x0, 0xf, 0x1E, 0x2D, 0x3C, 0x4B, 0x5A, 0x69, 0x78 }) + )); + expectedGcsLoadRows.add(Arrays.asList( + 3L, + 8L, + false, + 1993L, + 199319931993L, + 19.93, + 199319.931993, + "nineteen", + boxByteArray(new byte[] { 0x0, 0xf, 0x1E, 0x2D, 0x3C, 0x4B, 0x5A, 0x69, 0x78 }) + )); + result.add(new Object[] {"gcs-load", expectedGcsLoadRows}); + + List<List<Object>> expectedNullsRows = new ArrayList<>(); + expectedNullsRows.add(Arrays.asList(1L, "Required string", null, 42L, false)); + expectedNullsRows.add(Arrays.asList(2L, "Required string", "Optional string", 89L, null)); + expectedNullsRows.add(Arrays.asList(3L, "Required string", null, null, true)); + expectedNullsRows.add(Arrays.asList(4L,
"Required string", "Optional string", null, null)); + result.add(new Object[] {"nulls", expectedNullsRows}); + + List> expectedMatryoshkaRows = new ArrayList<>(); + expectedMatryoshkaRows.add(Arrays.asList( + 1L, + Arrays.asList( + Arrays.asList(42.0, 42.42, 42.4242), + Arrays.asList( + 42L, + "42" + ) + ), + Arrays.asList( + -42L, + "-42" + ) + )); + result.add(new Object[] {"matryoshka-dolls", expectedMatryoshkaRows}); + + List> expectedPrimitivesRows = new ArrayList<>(); + expectedPrimitivesRows.add(Arrays.asList( + 1L, + null, + false, + 4242L, + 42424242424242L, + 42.42, + 42424242.42424242, + "forty-two", + boxByteArray(new byte[] { 0x0, 0xf, 0x1E, 0x2D, 0x3C, 0x4B, 0x5A, 0x69, 0x78 }) + )); + result.add(new Object[] {"primitives", expectedPrimitivesRows}); + + List> expectedLogicalTypesRows = new ArrayList<>(); + expectedLogicalTypesRows.add(Arrays.asList(1L, 0L, 0L)); + expectedLogicalTypesRows.add(Arrays.asList(2L, 42000000000L, 362880000000L)); + expectedLogicalTypesRows.add(Arrays.asList(3L, 1468275102000000L, 1468195200000L)); + result.add(new Object[] {"logical-types", expectedLogicalTypesRows}); + + return result; + } + + private static final String TEST_CASE_PREFIX = "kcbq_test_"; + + // Share a single embedded Connect and Schema Registry cluster for all test cases to keep the runtime down + private static BaseConnectorIT testBase; + private static RestApp schemaRegistry; + private static String schemaRegistryUrl; + + private final String testCase; + private final List> expectedRows; + private final String topic; + private final String table; + private final String connectorName; + + private Producer valueProducer; + private int numRecordsProduced; + + public BigQuerySinkConnectorIT(String testCase, List> expectedRows) { + this.testCase = testCase; + this.expectedRows = expectedRows; + + this.topic = TEST_CASE_PREFIX + testCase; + this.table = testBase.suffixedAndSanitizedTable(topic); + this.connectorName = "bigquery-connector-" + testCase; + } + + @BeforeClass + public static void globalSetup() throws Exception { + testBase = new BaseConnectorIT() {}; + testBase.startConnect(); + + schemaRegistry = new RestApp( + ClusterTestHarness.choosePort(), + null, + testBase.connect.kafka().bootstrapServers(), + SchemaRegistryConfig.DEFAULT_KAFKASTORE_TOPIC, + CompatibilityLevel.BACKWARD.name, + true, + null); + + schemaRegistry.start(); + + schemaRegistryUrl = schemaRegistry.restClient.getBaseUrls().current(); + + BucketClearer.clearBucket( + testBase.keyFile(), + testBase.project(), + testBase.gcsBucket(), + testBase.gcsFolder(), + testBase.keySource() + ); + } + + @Before + public void setup() { + TableClearer.clearTables(testBase.newBigQuery(), testBase.dataset(), table); + + Map producerProps = new HashMap<>(); + producerProps.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, testBase.connect.kafka().bootstrapServers()); + valueProducer = new KafkaProducer<>( + producerProps, Serdes.ByteArray().serializer(), Serdes.ByteArray().serializer()); + + numRecordsProduced = 0; + } + + @After + public void cleanup() { + testBase.connect.deleteConnector(connectorName); + } + + @AfterClass + public static void globalCleanup() { + if (schemaRegistry != null) { + Utils.closeQuietly(schemaRegistry::stop, "embedded Schema Registry instance"); + } + testBase.stopConnect(); + } + + @Test + public void runTestCase() throws Exception { + final int tasksMax = 1; + + populate(); + + testBase.connect.configureConnector(connectorName, connectorProps(tasksMax)); + + 
testBase.waitForConnectorToStart(connectorName, tasksMax); + + testBase.waitForCommittedRecords( + connectorName, Collections.singleton(topic), numRecordsProduced, tasksMax, TimeUnit.MINUTES.toMillis(3)); + + verify(); + } + + private void populate() { + testBase.connect.kafka().createTopic(topic); + + String testCaseDir = "integration_test_cases/" + testCase + "/"; + + InputStream schemaStream = BigQuerySinkConnectorIT.class.getClassLoader() + .getResourceAsStream(testCaseDir + "schema.json"); + Scanner schemaScanner = new Scanner(schemaStream).useDelimiter("\\A"); + String schemaString = schemaScanner.next(); + + Properties messageReaderProps = new Properties(); + messageReaderProps.put(SCHEMA_REGISTRY_URL_CONFIG, schemaRegistryUrl); + messageReaderProps.put("value.schema", schemaString); + messageReaderProps.put("topic", topic); + InputStream dataStream = BigQuerySinkConnectorIT.class.getClassLoader() + .getResourceAsStream(testCaseDir + "data.json"); + MessageReader messageReader = new AvroMessageReader(); + messageReader.init(dataStream, messageReaderProps); + + ProducerRecord<byte[], byte[]> message = messageReader.readMessage(); + while (message != null) { + try { + valueProducer.send(message).get(1, TimeUnit.SECONDS); + numRecordsProduced++; + } catch (InterruptedException | ExecutionException | TimeoutException e) { + throw new RuntimeException(e); + } + message = messageReader.readMessage(); + } + } + + private Map<String, String> connectorProps(int tasksMax) { + Map<String, String> result = testBase.baseConnectorProps(tasksMax); + + result.put( + ConnectorConfig.KEY_CONVERTER_CLASS_CONFIG, + AvroConverter.class.getName()); + result.put( + ConnectorConfig.KEY_CONVERTER_CLASS_CONFIG + "." + SCHEMA_REGISTRY_URL_CONFIG, + schemaRegistryUrl); + result.put( + ConnectorConfig.VALUE_CONVERTER_CLASS_CONFIG, + AvroConverter.class.getName()); + result.put( + ConnectorConfig.VALUE_CONVERTER_CLASS_CONFIG + "."
+ SCHEMA_REGISTRY_URL_CONFIG, + schemaRegistryUrl); + + result.put(SinkConnectorConfig.TOPICS_CONFIG, topic); + + result.put(BigQuerySinkConfig.ALLOW_NEW_BIGQUERY_FIELDS_CONFIG, "true"); + result.put(BigQuerySinkConfig.ALLOW_BIGQUERY_REQUIRED_FIELD_RELAXATION_CONFIG, "true"); + result.put(BigQuerySinkConfig.ENABLE_BATCH_CONFIG, testBase.suffixedAndSanitizedTable("kcbq_test_gcs-load")); + result.put(BigQuerySinkConfig.BATCH_LOAD_INTERVAL_SEC_CONFIG, "10"); + result.put(BigQuerySinkConfig.GCS_BUCKET_NAME_CONFIG, testBase.gcsBucket()); + result.put(BigQuerySinkConfig.GCS_FOLDER_NAME_CONFIG, testBase.gcsFolder()); + result.put(BigQuerySinkConfig.SCHEMA_RETRIEVER_CONFIG, IdentitySchemaRetriever.class.getName()); + + String suffix = testBase.tableSuffix(); + if (!suffix.isEmpty()) { + String escapedSuffix = suffix.replaceAll("\\\\", "\\\\\\\\").replaceAll("\\$", "\\\\\\$"); + result.put("transforms", "addSuffix"); + result.put("transforms.addSuffix.type", "org.apache.kafka.connect.transforms.RegexRouter"); + result.put("transforms.addSuffix.regex", "(.*)"); + result.put("transforms.addSuffix.replacement", "$1" + escapedSuffix); + } + + return result; + } + + private void verify() { + List<List<Object>> testRows; + try { + String table = testBase.suffixedAndSanitizedTable(TEST_CASE_PREFIX + FieldNameSanitizer.sanitizeName(testCase)); + testRows = testBase.readAllRows(testBase.newBigQuery(), table, "row"); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + + assertEquals(expectedRows, testRows); + } +} diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/GcpClientBuilderIT.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/GcpClientBuilderIT.java new file mode 100644 index 000000000..64693e3f5 --- /dev/null +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/GcpClientBuilderIT.java @@ -0,0 +1,67 @@ +package com.wepay.kafka.connect.bigquery.integration; + +import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.bigquery.DatasetId; +import com.google.cloud.storage.Storage; +import com.wepay.kafka.connect.bigquery.GcpClientBuilder; +import com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig; +import org.apache.kafka.test.IntegrationTest; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.Map; + +@Category(IntegrationTest.class) +public class GcpClientBuilderIT extends BaseConnectorIT { + + private BigQuerySinkConfig connectorProps(GcpClientBuilder.KeySource keySource) throws IOException { + Map<String, String> properties = baseConnectorProps(1); + properties.put(BigQuerySinkConfig.KEY_SOURCE_CONFIG, keySource.name()); + + if (keySource == GcpClientBuilder.KeySource.APPLICATION_DEFAULT) { + properties.put(BigQuerySinkConfig.KEYFILE_CONFIG, null); + } + else if (keySource == GcpClientBuilder.KeySource.JSON){ + // actually keyFile is the path to the credentials file, so we convert it to the json string + String credentialsJsonString = new String(Files.readAllBytes(Paths.get(keyFile())), StandardCharsets.UTF_8); + properties.put(BigQuerySinkConfig.KEYFILE_CONFIG, credentialsJsonString); + } + + return new BigQuerySinkConfig(properties); + } + + /** + * Construct the BigQuery and Storage clients and perform some basic operations to check they are operational.
+ * @param keySource the key Source to use + * @throws IOException + */ + private void testClients(GcpClientBuilder.KeySource keySource) throws IOException { + BigQuerySinkConfig config = connectorProps(keySource); + + BigQuery bigQuery = new GcpClientBuilder.BigQueryBuilder().withConfig(config).build(); + Storage storage = new GcpClientBuilder.GcsBuilder().withConfig(config).build(); + + bigQuery.listTables(DatasetId.of(dataset())); + storage.get(gcsBucket()); + } + + @Test + public void testApplicationDefaultCredentials() throws IOException { + testClients(GcpClientBuilder.KeySource.APPLICATION_DEFAULT); + } + + @Test + public void testFile() throws IOException { + testClients(GcpClientBuilder.KeySource.FILE); + } + + @Test + public void testJson() throws IOException { + testClients(GcpClientBuilder.KeySource.JSON); + } + +} diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/TimePartitioningIT.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/TimePartitioningIT.java new file mode 100644 index 000000000..a2cc8579d --- /dev/null +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/TimePartitioningIT.java @@ -0,0 +1,315 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package com.wepay.kafka.connect.bigquery.integration; + +import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.bigquery.QueryJobConfiguration; +import com.google.cloud.bigquery.StandardTableDefinition; +import com.google.cloud.bigquery.TableId; +import com.google.cloud.bigquery.TableResult; +import com.google.cloud.bigquery.TimePartitioning; +import com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig; +import com.wepay.kafka.connect.bigquery.integration.utils.TableClearer; +import com.wepay.kafka.connect.bigquery.retrieve.IdentitySchemaRetriever; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.Producer; +import org.apache.kafka.clients.producer.ProducerConfig; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.common.serialization.Serdes; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.SchemaBuilder; +import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.errors.ConnectException; +import org.apache.kafka.connect.json.JsonConverter; +import org.apache.kafka.connect.json.JsonConverterConfig; +import org.apache.kafka.connect.runtime.SinkConnectorConfig; +import org.apache.kafka.connect.storage.Converter; +import org.apache.kafka.test.IntegrationTest; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.TimeUnit; + +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig.TIME_PARTITIONING_TYPE_CONFIG; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkTaskConfig.BIGQUERY_MESSAGE_TIME_PARTITIONING_CONFIG; +import static com.wepay.kafka.connect.bigquery.config.BigQuerySinkTaskConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG; +import static org.apache.kafka.connect.runtime.ConnectorConfig.KEY_CONVERTER_CLASS_CONFIG; +import static org.apache.kafka.connect.runtime.ConnectorConfig.VALUE_CONVERTER_CLASS_CONFIG; +import static org.apache.kafka.test.TestUtils.waitForCondition; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +@Category(IntegrationTest.class) +@RunWith(Parameterized.class) +public class TimePartitioningIT { + + private static final Logger logger = LoggerFactory.getLogger(TimePartitioningIT.class); + + private static final long NUM_RECORDS_PRODUCED = 20; + private static final int TASKS_MAX = 1; + + private static BaseConnectorIT testBase; + + private BigQuery bigQuery; + + private final TimePartitioning.Type partitioningType; + private final boolean usePartitionDecorator; + private final boolean messageTimePartitioning; + private final int testCase; + private final long testStartTime; + private final String connectorName; + + public TimePartitioningIT( + TimePartitioning.Type partitioningType, + boolean usePartitionDecorator, + boolean messageTimePartitioning, + int testCase + ) { + this.partitioningType = partitioningType; + this.usePartitionDecorator = usePartitionDecorator; + this.messageTimePartitioning = messageTimePartitioning; + this.testCase = testCase; + this.testStartTime = 
System.currentTimeMillis(); + this.connectorName = "kcbq-time-partitioning-test-" + testCase; + } + + @Parameterized.Parameters(name = "{index}: partitioningType: {0}, usePartitionDecorator: {1}, messageTimePartitioning: {2}") + public static Iterable<Object[]> data() { + int testCase = 0; + return Arrays.asList( + new Object[] {TimePartitioning.Type.HOUR, false, false, testCase++ }, + new Object[] {TimePartitioning.Type.DAY, true, true, testCase++ }, + new Object[] {TimePartitioning.Type.DAY, true, false, testCase++ }, + new Object[] {TimePartitioning.Type.DAY, false, false, testCase++ }, + new Object[] {TimePartitioning.Type.MONTH, false, false, testCase++ }, + new Object[] {TimePartitioning.Type.YEAR, false, false, testCase } + ); + } + + @BeforeClass + public static void globalSetup() { + testBase = new BaseConnectorIT() {}; + BigQuery bigQuery = testBase.newBigQuery(); + data().forEach(args -> { + int testCase = (int) args[3]; + TableClearer.clearTables(bigQuery, testBase.dataset(), table(testCase)); + }); + testBase.startConnect(); + } + + @Before + public void setup() { + bigQuery = testBase.newBigQuery(); + } + + @After + public void close() { + bigQuery = null; + testBase.connect.deleteConnector(connectorName); + } + + @AfterClass + public static void globalCleanup() { + testBase.stopConnect(); + } + + private Map<String, String> partitioningProps() { + Map<String, String> result = new HashMap<>(); + + // use the JSON converter with schemas enabled + result.put(KEY_CONVERTER_CLASS_CONFIG, JsonConverter.class.getName()); + result.put(VALUE_CONVERTER_CLASS_CONFIG, JsonConverter.class.getName()); + + result.put(BIGQUERY_PARTITION_DECORATOR_CONFIG, Boolean.toString(usePartitionDecorator)); + result.put(BIGQUERY_MESSAGE_TIME_PARTITIONING_CONFIG, Boolean.toString(messageTimePartitioning)); + result.put(TIME_PARTITIONING_TYPE_CONFIG, partitioningType.name()); + + return result; + } + + private static String table(int testCase) { + return testBase.suffixedAndSanitizedTable("test-time-partitioning-" + testCase); + } + + @Test + public void testTimePartitioning() throws Throwable { + // create topic in Kafka + final String topic = testBase.suffixedTableOrTopic("test-time-partitioning-" + testCase); + testBase.connect.kafka().createTopic(topic); + + // setup props for the sink connector + Map<String, String> props = testBase.baseConnectorProps(TASKS_MAX); + props.put(SinkConnectorConfig.TOPICS_CONFIG, topic); + + props.put(BigQuerySinkConfig.SANITIZE_TOPICS_CONFIG, "true"); + props.put(BigQuerySinkConfig.SCHEMA_RETRIEVER_CONFIG, IdentitySchemaRetriever.class.getName()); + props.put(BigQuerySinkConfig.TABLE_CREATE_CONFIG, "true"); + + props.putAll(partitioningProps()); + + // start a sink connector + testBase.connect.configureConnector(connectorName, props); + + // wait for tasks to spin up + testBase.waitForConnectorToStart(connectorName, TASKS_MAX); + + // Instantiate the converter we'll use to send records to the connector + Converter valueConverter = converter(); + + // Instantiate the producer we'll use to write records to Kafka + Map<String, Object> producerProps = new HashMap<>(); + producerProps.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, testBase.connect.kafka().bootstrapServers()); + Producer<Void, String> valueProducer = new KafkaProducer<>( + producerProps, Serdes.Void().serializer(), Serdes.String().serializer() + ); + + // Send records to Kafka + for (int i = 0; i < NUM_RECORDS_PRODUCED; i++) { + String kafkaValue = value(valueConverter, topic, i); + logger.debug("Sending message with value '{}' to topic '{}'", kafkaValue, topic); + ProducerRecord<Void, String> kafkaRecord =
new ProducerRecord<>(topic, null, timestamp((i % 3) - 1), null, kafkaValue); + try { + valueProducer.send(kafkaRecord).get(30, TimeUnit.SECONDS); + } catch (Exception e) { + throw new ConnectException("Failed to produce data to embedded Kafka cluster", e); + } + } + + // wait for tasks to write to BigQuery and commit offsets for their records + testBase.waitForCommittedRecords( + connectorName, + Collections.singleton(topic), + NUM_RECORDS_PRODUCED, + TASKS_MAX, + TimeUnit.MINUTES.toMillis(3) + ); + + String table = table(testCase); + + // Might fail to read from the table for a little bit; keep retrying until it's available + waitForCondition( + () -> { + try { + testBase.readAllRows(bigQuery, table, "i"); + return true; + } catch (RuntimeException e) { + logger.debug("Failed to read rows from table {}", table, e); + return false; + } + }, + TimeUnit.MINUTES.toMillis(5), + "Could not read from table to verify data after connector committed offsets for the expected number of records" + ); + + List<List<Object>> allRows = testBase.readAllRows(bigQuery, table, "i"); + // Just check to make sure we sent the expected number of rows to the table + assertEquals(NUM_RECORDS_PRODUCED, allRows.size()); + + // Ensure that the table was created with the expected time partitioning type + StandardTableDefinition tableDefinition = bigQuery.getTable(TableId.of(testBase.dataset(), table)).getDefinition(); + Optional<TimePartitioning.Type> actualPartitioningType = Optional.ofNullable((tableDefinition).getTimePartitioning()) + .map(TimePartitioning::getType); + assertEquals(Optional.of(partitioningType), actualPartitioningType); + + // Verify that at least one record landed in each of the targeted partitions + if (usePartitionDecorator && messageTimePartitioning) { + for (int i = -1; i < 2; i++) { + long partitionTime = timestamp(i); + TableResult tableResult = bigQuery.query(QueryJobConfiguration.of(String.format( + "SELECT * FROM `%s`.`%s` WHERE _PARTITIONTIME = TIMESTAMP_TRUNC(TIMESTAMP_MILLIS(%d), %s)", + testBase.dataset(), + table, + partitionTime, + partitioningType.toString() + ))); + assertTrue( + "Should have seen at least one row in partition corresponding to timestamp " + partitionTime, + tableResult.getValues().iterator().hasNext() + ); + } + } + } + + private Converter converter() { + Map<String, Object> props = new HashMap<>(); + props.put(JsonConverterConfig.SCHEMAS_ENABLE_CONFIG, true); + Converter result = new JsonConverter(); + result.configure(props, false); + return result; + } + + private String value(Converter converter, String topic, long iteration) { + final Schema schema = SchemaBuilder.struct() + .optional() + .field("i", Schema.INT64_SCHEMA) + .field("f1", Schema.STRING_SCHEMA) + .field("f2", Schema.BOOLEAN_SCHEMA) + .field("f3", Schema.FLOAT64_SCHEMA) + .build(); + + final Struct struct = new Struct(schema) + .put("i", iteration) + .put("f1", iteration % 2 == 0 ?
"a string" : "another string") + .put("f2", iteration % 3 == 0) + .put("f3", iteration / 39.80); + + return new String(converter.fromConnectData(topic, schema, struct)); + } + + /** + * @param shiftAmount how many partitions forward/backward to shift the timestamp by, + * relative to the partition corresponding to the start of the test + */ + private long timestamp(long shiftAmount) { + long partitionDelta; + switch (partitioningType) { + case HOUR: + partitionDelta = TimeUnit.HOURS.toMillis(1); + break; + case DAY: + partitionDelta = TimeUnit.DAYS.toMillis(1); + break; + case MONTH: + partitionDelta = TimeUnit.DAYS.toMillis(31); + break; + case YEAR: + partitionDelta = TimeUnit.DAYS.toMillis(366); + break; + default: + throw new ConnectException("Unexpected partitioning type: " + partitioningType); + } + + return testStartTime + (shiftAmount * partitionDelta); + } +} diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/UpsertDeleteBigQuerySinkConnectorIT.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/UpsertDeleteBigQuerySinkConnectorIT.java new file mode 100644 index 000000000..5f83e7ef8 --- /dev/null +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/UpsertDeleteBigQuerySinkConnectorIT.java @@ -0,0 +1,411 @@ +/* + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package com.wepay.kafka.connect.bigquery.integration; + +import com.google.cloud.bigquery.BigQuery; +import com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig; +import com.wepay.kafka.connect.bigquery.integration.utils.TableClearer; +import com.wepay.kafka.connect.bigquery.retrieve.IdentitySchemaRetriever; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.SchemaBuilder; +import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.json.JsonConverter; +import org.apache.kafka.connect.json.JsonConverterConfig; +import org.apache.kafka.connect.runtime.ConnectorConfig; +import org.apache.kafka.connect.runtime.SinkConnectorConfig; +import org.apache.kafka.connect.storage.Converter; +import org.apache.kafka.test.IntegrationTest; +import org.junit.After; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; +import java.util.stream.LongStream; + +import static org.apache.kafka.connect.runtime.ConnectorConfig.KEY_CONVERTER_CLASS_CONFIG; +import static org.apache.kafka.connect.runtime.ConnectorConfig.VALUE_CONVERTER_CLASS_CONFIG; +import static org.junit.Assert.assertEquals; + +@Category(IntegrationTest.class) +public class UpsertDeleteBigQuerySinkConnectorIT extends BaseConnectorIT { + + private static final Logger logger = LoggerFactory.getLogger(UpsertDeleteBigQuerySinkConnectorIT.class); + + private static final String CONNECTOR_NAME = "kcbq-sink-connector"; + private static final long NUM_RECORDS_PRODUCED = 20; + private static final int TASKS_MAX = 3; + private static final String KAFKA_FIELD_NAME = "kafkaKey"; + + private BigQuery bigQuery; + + @Before + public void setup() { + bigQuery = newBigQuery(); + startConnect(); + } + + @After + public void close() { + bigQuery = null; + stopConnect(); + } + + private Map<String, String> upsertDeleteProps( + boolean upsert, + boolean delete, + long mergeRecordsThreshold) { + if (!upsert && !delete) { + throw new IllegalArgumentException("At least one of upsert or delete must be enabled"); + } + + Map<String, String> result = new HashMap<>(); + + // use the JSON converter with schemas enabled + result.put(KEY_CONVERTER_CLASS_CONFIG, JsonConverter.class.getName()); + result.put(VALUE_CONVERTER_CLASS_CONFIG, JsonConverter.class.getName()); + + if (upsert) { + result.put(BigQuerySinkConfig.UPSERT_ENABLED_CONFIG, "true"); + } + if (delete) { + result.put(BigQuerySinkConfig.DELETE_ENABLED_CONFIG, "true"); + } + + // Hardcode merge flushes to just use number of records for now, as it's more deterministic and + // faster to test + result.put(BigQuerySinkConfig.MERGE_INTERVAL_MS_CONFIG, "-1"); + result.put(BigQuerySinkConfig.MERGE_RECORDS_THRESHOLD_CONFIG, Long.toString(mergeRecordsThreshold)); + + result.put(BigQuerySinkConfig.KAFKA_KEY_FIELD_NAME_CONFIG, KAFKA_FIELD_NAME); + + return result; + } + + @Test + public void testUpsert() throws Throwable { + // create topic in Kafka + final String topic = suffixedTableOrTopic("test-upsert"); + // Make sure each task gets to read from at least one partition + connect.kafka().createTopic(topic, TASKS_MAX); + + final String table = sanitizedTable(topic); +
+    TableClearer.clearTables(bigQuery, dataset(), table);
+
+    // setup props for the sink connector
+    Map<String, String> props = baseConnectorProps(TASKS_MAX);
+    props.put(SinkConnectorConfig.TOPICS_CONFIG, topic);
+
+    props.put(BigQuerySinkConfig.SANITIZE_TOPICS_CONFIG, "true");
+    props.put(BigQuerySinkConfig.SCHEMA_RETRIEVER_CONFIG, IdentitySchemaRetriever.class.getName());
+    props.put(BigQuerySinkConfig.TABLE_CREATE_CONFIG, "true");
+
+    // Enable only upsert and not delete, and merge flush every other record
+    props.putAll(upsertDeleteProps(true, false, 2));
+
+    // start a sink connector
+    connect.configureConnector(CONNECTOR_NAME, props);
+
+    // wait for tasks to spin up
+    waitForConnectorToStart(CONNECTOR_NAME, TASKS_MAX);
+
+    // Instantiate the converters we'll use to send records to the connector
+    Converter keyConverter = converter(true);
+    Converter valueConverter = converter(false);
+
+    // Send records to Kafka
+    for (int i = 0; i < NUM_RECORDS_PRODUCED; i++) {
+      // Each pair of records will share a key. Only the second record of each pair should be
+      // present in the table at the end of the test
+      String kafkaKey = key(keyConverter, topic, i / 2);
+      String kafkaValue = value(valueConverter, topic, i, false);
+      logger.debug("Sending message with key '{}' and value '{}' to topic '{}'", kafkaKey, kafkaValue, topic);
+      connect.kafka().produce(topic, kafkaKey, kafkaValue);
+    }
+
+    // wait for tasks to write to BigQuery and commit offsets for their records
+    waitForCommittedRecords(CONNECTOR_NAME, topic, NUM_RECORDS_PRODUCED, TASKS_MAX);
+
+    List<List<Object>> allRows = readAllRows(bigQuery, table, KAFKA_FIELD_NAME + ".k1");
+    List<List<Object>> expectedRows = LongStream.range(0, NUM_RECORDS_PRODUCED / 2)
+        .mapToObj(i -> Arrays.asList(
+            "another string",
+            (i - 1) % 3 == 0,
+            (i * 2 + 1) / 0.69,
+            Collections.singletonList(i)))
+        .collect(Collectors.toList());
+    assertEquals(expectedRows, allRows);
+  }
+
+  @Test
+  public void testDelete() throws Throwable {
+    // create topic in Kafka
+    final String topic = suffixedTableOrTopic("test-delete");
+    // Make sure each task gets to read from at least one partition
+    connect.kafka().createTopic(topic, TASKS_MAX);
+
+    final String table = sanitizedTable(topic);
+    TableClearer.clearTables(bigQuery, dataset(), table);
+
+    // setup props for the sink connector
+    Map<String, String> props = baseConnectorProps(TASKS_MAX);
+    props.put(SinkConnectorConfig.TOPICS_CONFIG, topic);
+
+    props.put(BigQuerySinkConfig.SANITIZE_TOPICS_CONFIG, "true");
+    props.put(BigQuerySinkConfig.SCHEMA_RETRIEVER_CONFIG, IdentitySchemaRetriever.class.getName());
+    props.put(BigQuerySinkConfig.TABLE_CREATE_CONFIG, "true");
+
+    // Enable only delete and not upsert, and merge flush every other record
+    props.putAll(upsertDeleteProps(false, true, 2));
+
+    // start a sink connector
+    connect.configureConnector(CONNECTOR_NAME, props);
+
+    // wait for tasks to spin up
+    waitForConnectorToStart(CONNECTOR_NAME, TASKS_MAX);
+
+    // Instantiate the converters we'll use to send records to the connector
+    Converter keyConverter = converter(true);
+    Converter valueConverter = converter(false);
+
+    // Send records to Kafka
+    for (int i = 0; i < NUM_RECORDS_PRODUCED; i++) {
+      // Each pair of records will share a key. Because upsert is not enabled, no deduplication will take place
+      // and, unless a tombstone is written for that key, both will be inserted
+      String kafkaKey = key(keyConverter, topic, i / 2);
+      // Every fourth record will be a tombstone, so every record pair with an odd-numbered key will be dropped
+      String kafkaValue = value(valueConverter, topic, i, i % 4 == 3);
+      logger.debug("Sending message with key '{}' and value '{}' to topic '{}'", kafkaKey, kafkaValue, topic);
+      connect.kafka().produce(topic, kafkaKey, kafkaValue);
+    }
+
+    // wait for tasks to write to BigQuery and commit offsets for their records
+    waitForCommittedRecords(CONNECTOR_NAME, topic, NUM_RECORDS_PRODUCED, TASKS_MAX);
+
+    // Since we have multiple rows per key, order by key and the f3 field (which should be
+    // monotonically increasing in insertion order)
+    List<List<Object>> allRows = readAllRows(bigQuery, table, KAFKA_FIELD_NAME + ".k1, f3");
+    List<List<Object>> expectedRows = LongStream.range(0, NUM_RECORDS_PRODUCED)
+        .filter(i -> i % 4 < 2)
+        .mapToObj(i -> Arrays.asList(
+            i % 4 == 0 ? "a string" : "another string",
+            i % 3 == 0,
+            i / 0.69,
+            Collections.singletonList(i * 2 / 4)))
+        .collect(Collectors.toList());
+    assertEquals(expectedRows, allRows);
+  }
+
+  @Test
+  public void testUpsertDelete() throws Throwable {
+    // create topic in Kafka
+    final String topic = suffixedTableOrTopic("test-upsert-delete");
+    // Make sure each task gets to read from at least one partition
+    connect.kafka().createTopic(topic, TASKS_MAX);
+
+    final String table = sanitizedTable(topic);
+    TableClearer.clearTables(bigQuery, dataset(), table);
+
+    // setup props for the sink connector
+    Map<String, String> props = baseConnectorProps(TASKS_MAX);
+    props.put(SinkConnectorConfig.TOPICS_CONFIG, topic);
+
+    props.put(BigQuerySinkConfig.SANITIZE_TOPICS_CONFIG, "true");
+    props.put(BigQuerySinkConfig.SCHEMA_RETRIEVER_CONFIG, IdentitySchemaRetriever.class.getName());
+    props.put(BigQuerySinkConfig.TABLE_CREATE_CONFIG, "true");
+
+    // Enable upsert and delete, and merge flush every other record
+    props.putAll(upsertDeleteProps(true, true, 2));
+
+    // start a sink connector
+    connect.configureConnector(CONNECTOR_NAME, props);
+
+    // wait for tasks to spin up
+    waitForConnectorToStart(CONNECTOR_NAME, TASKS_MAX);
+
+    // Instantiate the converters we'll use to send records to the connector
+    Converter keyConverter = converter(true);
+    Converter valueConverter = converter(false);
+
+    // Send records to Kafka
+    for (int i = 0; i < NUM_RECORDS_PRODUCED; i++) {
+      // Each pair of records will share a key. Only the second record of each pair should be
+      // present in the table at the end of the test
+      String kafkaKey = key(keyConverter, topic, i / 2);
+      // Every fourth record will be a tombstone, so every record pair with an odd-numbered key will be dropped
+      String kafkaValue = value(valueConverter, topic, i, i % 4 == 3);
+      logger.debug("Sending message with key '{}' and value '{}' to topic '{}'", kafkaKey, kafkaValue, topic);
+      connect.kafka().produce(topic, kafkaKey, kafkaValue);
+    }
+
+    // wait for tasks to write to BigQuery and commit offsets for their records
+    waitForCommittedRecords(CONNECTOR_NAME, topic, NUM_RECORDS_PRODUCED, TASKS_MAX);
+
+    // Since we have multiple rows per key, order by key and the f3 field (which should be
+    // monotonically increasing in insertion order)
+    List<List<Object>> allRows = readAllRows(bigQuery, table, KAFKA_FIELD_NAME + ".k1, f3");
+    List<List<Object>> expectedRows = LongStream.range(0, NUM_RECORDS_PRODUCED)
+        .filter(i -> i % 4 == 1)
+        .mapToObj(i -> Arrays.asList(
+            "another string",
+            i % 3 == 0,
+            i / 0.69,
+            Collections.singletonList(i * 2 / 4)))
+        .collect(Collectors.toList());
+    assertEquals(expectedRows, allRows);
+  }
+
+  @Test
+  @Ignore("Skipped during regular testing; comment-out annotation to run")
+  public void testUpsertDeleteHighThroughput() throws Throwable {
+    final long numRecords = 1_000_000L;
+    final int numPartitions = 10;
+    final int tasksMax = 1;
+
+    // create topic in Kafka
+    final String topic = suffixedTableOrTopic("test-upsert-delete-throughput");
+    connect.kafka().createTopic(topic, numPartitions);
+
+    final String table = sanitizedTable(topic);
+    TableClearer.clearTables(bigQuery, dataset(), table);
+
+    // Instantiate the converters we'll use to send records to the connector
+    Converter keyConverter = converter(true);
+    Converter valueConverter = converter(false);
+
+    // Send records to Kafka. Pre-populate Kafka before starting the connector as we want to measure
+    // the connector's throughput cleanly
+    logger.info("Pre-populating Kafka with test data");
+    for (int i = 0; i < numRecords; i++) {
+      if (i % 10000 == 0) {
+        logger.info("{} records produced so far", i);
+      }
+      // Each pair of records will share a key. Only the second record of each pair should be
+      // present in the table at the end of the test
+      String kafkaKey = key(keyConverter, topic, i / 2);
+      // Every fourth record will be a tombstone, so every record pair with an odd-numbered key will
+      // be dropped
+      String kafkaValue = value(valueConverter, topic, i, i % 4 == 3);
+      connect.kafka().produce(topic, kafkaKey, kafkaValue);
+    }
+
+    // setup props for the sink connector
+    // use a single task
+    Map<String, String> props = baseConnectorProps(tasksMax);
+    props.put(SinkConnectorConfig.TOPICS_CONFIG, topic);
+    // Allow for at most 10,000 records per call to poll
+    props.put(ConnectorConfig.CONNECTOR_CLIENT_CONSUMER_OVERRIDES_PREFIX
+            + ConsumerConfig.MAX_POLL_RECORDS_CONFIG,
+        "10000");
+    // Try to get at least 1 MB per partition with each request
+    props.put(ConnectorConfig.CONNECTOR_CLIENT_CONSUMER_OVERRIDES_PREFIX
+            + ConsumerConfig.FETCH_MIN_BYTES_CONFIG,
+        Integer.toString(ConsumerConfig.DEFAULT_MAX_PARTITION_FETCH_BYTES * numPartitions));
+    // Wait up to one second for each batch to reach the requested size
+    props.put(ConnectorConfig.CONNECTOR_CLIENT_CONSUMER_OVERRIDES_PREFIX
+            + ConsumerConfig.FETCH_MAX_WAIT_MS_CONFIG,
+        "1000"
+    );
+
+    props.put(BigQuerySinkConfig.SANITIZE_TOPICS_CONFIG, "true");
+    props.put(BigQuerySinkConfig.SCHEMA_RETRIEVER_CONFIG, IdentitySchemaRetriever.class.getName());
+    props.put(BigQuerySinkConfig.TABLE_CREATE_CONFIG, "true");
+
+    // Enable upsert and delete, and schedule ten total flushes
+    props.putAll(upsertDeleteProps(true, true, numRecords / 10));
+
+    logger.info("Pre-population complete; creating connector");
+    long start = System.currentTimeMillis();
+    // start a sink connector
+    connect.configureConnector(CONNECTOR_NAME, props);
+
+    // wait for tasks to spin up
+    waitForConnectorToStart(CONNECTOR_NAME, tasksMax);
+
+    // wait for tasks to write to BigQuery and commit offsets for their records
+    waitForCommittedRecords(
+        CONNECTOR_NAME, Collections.singleton(topic), numRecords, tasksMax, TimeUnit.MINUTES.toMillis(10));
+    long time = System.currentTimeMillis() - start;
+    logger.info("All records have been read and committed by the connector; "
+        + "total time from start to finish: {} seconds", time / 1000.0);
+
+    // Since we have multiple rows per key, order by key and the f3 field (which should be
+    // monotonically increasing in insertion order)
+    List<List<Object>> allRows = readAllRows(bigQuery, table, KAFKA_FIELD_NAME + ".k1, f3");
+    List<List<Object>> expectedRows = LongStream.range(0, numRecords)
+        .filter(i -> i % 4 == 1)
+        .mapToObj(i -> Arrays.asList(
+            "another string",
+            i % 3 == 0,
+            i / 0.69,
+            Collections.singletonList(i * 2 / 4)))
+        .collect(Collectors.toList());
+    assertEquals(expectedRows, allRows);
+  }
+
+  private Converter converter(boolean isKey) {
+    Map<String, Object> props = new HashMap<>();
+    props.put(JsonConverterConfig.SCHEMAS_ENABLE_CONFIG, true);
+    Converter result = new JsonConverter();
+    result.configure(props, isKey);
+    return result;
+  }
+
+  private String key(Converter converter, String topic, long iteration) {
+    final Schema schema = SchemaBuilder.struct()
+        .field("k1", Schema.INT64_SCHEMA)
+        .build();
+
+    final Struct struct = new Struct(schema)
+        .put("k1", iteration);
+
+    return new String(converter.fromConnectData(topic, schema, struct));
+  }
+
+  private String value(Converter converter, String topic, long iteration, boolean tombstone) {
+    final Schema schema = SchemaBuilder.struct()
+        .optional()
+        .field("f1", Schema.STRING_SCHEMA)
+        .field("f2", Schema.BOOLEAN_SCHEMA)
+        .field("f3", Schema.FLOAT64_SCHEMA)
+        .build();
+
+    if (tombstone) {
+      return new String(converter.fromConnectData(topic, schema, null));
+    }
+
+    final Struct struct = new Struct(schema)
+        .put("f1", iteration % 2 == 0 ? "a string" : "another string")
+        .put("f2", iteration % 3 == 0)
+        .put("f3", iteration / 0.69);
+
+    return new String(converter.fromConnectData(topic, schema, struct));
+  }
+}
diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/utils/BucketClearer.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/utils/BucketClearer.java
new file mode 100644
index 000000000..26b40e3fe
--- /dev/null
+++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/utils/BucketClearer.java
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2020 Confluent, Inc.
+ *
+ * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package com.wepay.kafka.connect.bigquery.integration.utils;
+
+import com.google.api.gax.paging.Page;
+import com.google.cloud.storage.Blob;
+import com.google.cloud.storage.Bucket;
+import com.google.cloud.storage.Storage;
+import com.wepay.kafka.connect.bigquery.GcpClientBuilder;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class BucketClearer {
+
+  private static final Logger logger = LoggerFactory.getLogger(BucketClearer.class);
+
+  /**
+   * Clear out a GCS bucket. Useful in integration testing to provide a clean slate before creating
+   * a connector and writing to that bucket.
+   * @param key The GCP credentials to use (can be a filename or a raw JSON string).
+   * @param project The GCP project the bucket belongs to.
+   * @param bucketName The bucket to clear.
+   * @param folderName The folder to clear (can be empty or null).
+   * @param keySource The key source. If "FILE", then the {@code key} parameter will be treated as a
+   *     filename; if "JSON", then {@code key} will be treated as a raw JSON string.
+   */
+  public static void clearBucket(String key, String project, String bucketName, String folderName, String keySource) {
+    Storage gcs = new GcpClientBuilder.GcsBuilder()
+        .withKeySource(GcpClientBuilder.KeySource.valueOf(keySource))
+        .withKey(key)
+        .withProject(project)
+        .build();
+    Bucket bucket = gcs.get(bucketName);
+    if (bucket != null) {
+      logger.info("Deleting objects in the {} folder for bucket {}",
+          humanReadableFolderName(folderName), bucketName);
+      for (Blob blob : listBlobs(bucket, folderName)) {
+        gcs.delete(blob.getBlobId());
+      }
+      bucket.delete();
+      logger.info("Bucket {} deleted successfully", bucketName);
+    } else {
+      logger.info("Bucket {} does not exist", bucketName);
+    }
+  }
+
+  private static String humanReadableFolderName(String folderName) {
+    return folderName == null || folderName.isEmpty()
+        ? "root"
+        : "'" + folderName + "'";
+  }
+
+  private static Iterable<Blob> listBlobs(Bucket bucket, String folderName) {
+    Page<Blob> blobListing = folderName == null || folderName.isEmpty()
+        ? bucket.list()
+        : bucket.list(Storage.BlobListOption.prefix(folderName));
+    return blobListing.iterateAll();
+  }
+}
diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/utils/TableClearer.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/utils/TableClearer.java
new file mode 100644
index 000000000..ea708bb19
--- /dev/null
+++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/integration/utils/TableClearer.java
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2020 Confluent, Inc.
+ *
+ * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package com.wepay.kafka.connect.bigquery.integration.utils;
+
+import com.google.cloud.bigquery.BigQuery;
+import com.google.cloud.bigquery.TableId;
+import com.wepay.kafka.connect.bigquery.utils.FieldNameSanitizer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+import static com.wepay.kafka.connect.bigquery.utils.TableNameUtils.table;
+
+public class TableClearer {
+  private static final Logger logger = LoggerFactory.getLogger(TableClearer.class);
+
+  /**
+   * Clear out one or more BigQuery tables. Useful in integration testing to provide a clean slate
+   * before creating a connector and writing to those tables.
+   * @param bigQuery The BigQuery client to use when sending table deletion requests.
+   * @param dataset The dataset that the to-be-cleared tables belong to.
+   * @param tables The tables to clear.
+   */
+  public static void clearTables(BigQuery bigQuery, String dataset, Collection<String> tables) {
+    for (String tableName : tables) {
+      TableId table = TableId.of(dataset, FieldNameSanitizer.sanitizeName(tableName));
+      if (bigQuery.delete(table)) {
+        logger.info("{} deleted successfully", table(table));
+      } else {
+        logger.info("{} does not exist", table(table));
+      }
+    }
+  }
+
+  /**
+   * Clear out one or more BigQuery tables. Useful in integration testing to provide a clean slate
+   * before creating a connector and writing to those tables.
+   * @param bigQuery The BigQuery client to use when sending table deletion requests.
+   * @param dataset The dataset that the to-be-cleared tables belong to.
+   * @param tables The tables to clear.
+   */
+  public static void clearTables(BigQuery bigQuery, String dataset, String... tables) {
+    clearTables(bigQuery, dataset, Arrays.asList(tables));
+  }
+}
diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/utils/FieldNameSanitizerTest.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/utils/FieldNameSanitizerTest.java
index 3358c1386..70d0d508b 100644
--- a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/utils/FieldNameSanitizerTest.java
+++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/utils/FieldNameSanitizerTest.java
@@ -1,5 +1,25 @@
+/*
+ * Copyright 2020 Confluent, Inc.
+ * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package com.wepay.kafka.connect.bigquery.utils; +import java.util.Collections; import java.util.Map; import org.junit.Before; import org.junit.Test; @@ -75,4 +95,26 @@ public void testInvalidSymbol() { // Validate map size. assertEquals(5, sanitizedMap.size()); } + + /** + * Verifies that null values are acceptable while sanitizing keys. + */ + @Test + public void testNullValue() { + assertEquals( + Collections.singletonMap("abc", null), + FieldNameSanitizer.replaceInvalidKeys(Collections.singletonMap("abc", null))); + } + + @Test + public void testDeeplyNestedNullValues() { + testMap = new HashMap<>(); + testMap.put("top", null); + testMap.put("middle", Collections.singletonMap("key", null)); + testMap.put("bottom", Collections.singletonMap("key", Collections.singletonMap("key", null))); + assertEquals( + testMap, + FieldNameSanitizer.replaceInvalidKeys(testMap) + ); + } } diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/utils/PartitionedTableIdTest.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/utils/PartitionedTableIdTest.java index 1737bfbdf..b6135ffa8 100644 --- a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/utils/PartitionedTableIdTest.java +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/utils/PartitionedTableIdTest.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.utils; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.utils; import com.google.cloud.bigquery.TableId; @@ -25,6 +26,7 @@ import java.time.Instant; import java.time.LocalDate; +import java.time.LocalDateTime; public class PartitionedTableIdTest { diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/write/row/BigQueryWriterTest.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/write/row/BigQueryWriterTest.java index d1d4d9477..d96273ca5 100644 --- a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/write/row/BigQueryWriterTest.java +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/write/row/BigQueryWriterTest.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.write.row; - /* - * Copyright 2016 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,8 +17,11 @@ * under the License. 
*/ +package com.wepay.kafka.connect.bigquery.write.row; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; +import static org.junit.Assert.assertTrue; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Matchers.anyObject; import static org.mockito.Mockito.mock; @@ -32,11 +35,12 @@ import com.google.cloud.bigquery.InsertAllRequest; import com.google.cloud.bigquery.InsertAllResponse; import com.google.cloud.bigquery.Table; +import com.google.cloud.bigquery.TableId; import com.google.cloud.storage.Storage; import com.wepay.kafka.connect.bigquery.BigQuerySinkTask; import com.wepay.kafka.connect.bigquery.SchemaManager; -import com.wepay.kafka.connect.bigquery.SinkTaskPropertiesFactory; +import com.wepay.kafka.connect.bigquery.SinkPropertiesFactory; import com.wepay.kafka.connect.bigquery.api.SchemaRetriever; import com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig; import com.wepay.kafka.connect.bigquery.config.BigQuerySinkTaskConfig; @@ -55,6 +59,7 @@ import java.util.ArrayList; import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -62,11 +67,11 @@ @SuppressWarnings("unchecked") public class BigQueryWriterTest { - private static SinkTaskPropertiesFactory propertiesFactory; + private static SinkPropertiesFactory propertiesFactory; @BeforeClass public static void initializePropertiesFactory() { - propertiesFactory = new SinkTaskPropertiesFactory(); + propertiesFactory = new SinkPropertiesFactory(); } @Test @@ -79,12 +84,9 @@ public void testBigQueryNoFailure() { Table mockTable = mock(Table.class); when(bigQuery.getTable(any())).thenReturn(mockTable); - Map> emptyMap = mock(Map.class); - when(emptyMap.isEmpty()).thenReturn(true); - InsertAllResponse insertAllResponse = mock(InsertAllResponse.class); when(insertAllResponse.hasErrors()).thenReturn(false); - when(insertAllResponse.getInsertErrors()).thenReturn(emptyMap); + when(insertAllResponse.getInsertErrors()).thenReturn(Collections.emptyMap()); //first attempt (success) when(bigQuery.insertAll(anyObject())) @@ -96,7 +98,9 @@ public void testBigQueryNoFailure() { SchemaManager schemaManager = mock(SchemaManager.class); Storage storage = mock(Storage.class); - BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + Map cache = new HashMap<>(); + + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.initialize(sinkTaskContext); testTask.start(properties); testTask.put( @@ -111,27 +115,28 @@ public void testAutoCreateTables() { final String topic = "test_topic"; final String dataset = "scratch"; final Map properties = makeProperties("3", "2000", topic, dataset); - properties.put(BigQuerySinkTaskConfig.TABLE_CREATE_CONFIG, "true"); + properties.put(BigQuerySinkConfig.TABLE_CREATE_CONFIG, "true"); BigQuery bigQuery = mock(BigQuery.class); - Map> emptyMap = mock(Map.class); - when(emptyMap.isEmpty()).thenReturn(true); InsertAllResponse insertAllResponse = mock(InsertAllResponse.class); when(insertAllResponse.hasErrors()).thenReturn(false); - when(insertAllResponse.getInsertErrors()).thenReturn(emptyMap); + when(insertAllResponse.getInsertErrors()).thenReturn(Collections.emptyMap()); - BigQueryException missTableException = mock(BigQueryException.class); - when(missTableException.getCode()).thenReturn(404); + String errorMessage = "Not found: Table project.scratch.test_topic"; + BigQueryError 
error = new BigQueryError("notFound", "global", errorMessage); + BigQueryException nonExistentTableException = new BigQueryException(404, errorMessage, error); - when(bigQuery.insertAll(anyObject())).thenThrow(missTableException).thenReturn(insertAllResponse); + when(bigQuery.insertAll(anyObject())).thenThrow(nonExistentTableException).thenReturn(insertAllResponse); SinkTaskContext sinkTaskContext = mock(SinkTaskContext.class); Storage storage = mock(Storage.class); SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager schemaManager = mock(SchemaManager.class); - BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + Map cache = new HashMap<>(); + + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.initialize(sinkTaskContext); testTask.start(properties); testTask.put( @@ -142,7 +147,7 @@ public void testAutoCreateTables() { verify(bigQuery, times(2)).insertAll(anyObject()); } - @Test + @Test(expected = BigQueryConnectException.class) public void testNonAutoCreateTables() { final String topic = "test_topic"; final String dataset = "scratch"; @@ -152,14 +157,11 @@ public void testNonAutoCreateTables() { Table mockTable = mock(Table.class); when(bigQuery.getTable(any())).thenReturn(mockTable); - Map> emptyMap = mock(Map.class); - when(emptyMap.isEmpty()).thenReturn(true); InsertAllResponse insertAllResponse = mock(InsertAllResponse.class); when(insertAllResponse.hasErrors()).thenReturn(false); - when(insertAllResponse.getInsertErrors()).thenReturn(emptyMap); + when(insertAllResponse.getInsertErrors()).thenReturn(Collections.emptyMap()); - BigQueryException missTableException = mock(BigQueryException.class); - when(missTableException.getCode()).thenReturn(404); + BigQueryException missTableException = new BigQueryException(404, "Table is missing"); when(bigQuery.insertAll(anyObject())).thenThrow(missTableException).thenReturn(insertAllResponse); @@ -168,15 +170,14 @@ public void testNonAutoCreateTables() { Storage storage = mock(Storage.class); SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager schemaManager = mock(SchemaManager.class); - BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + Map cache = new HashMap<>(); + + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.initialize(sinkTaskContext); testTask.start(properties); testTask.put( Collections.singletonList(spoofSinkRecord(topic, 0, 0, "some_field", "some_value"))); testTask.flush(Collections.emptyMap()); - - verify(schemaManager, times(0)).createTable(anyObject(), anyObject()); - verify(bigQuery, times(2)).insertAll(anyObject()); } @Test @@ -184,24 +185,16 @@ public void testBigQueryPartialFailure() { final String topic = "test_topic"; final String dataset = "scratch"; final Map properties = makeProperties("3", "2000", topic, dataset); - final Set failedRowSet = new HashSet<>(); - failedRowSet.add(1L); - - Map> insertErrorMap = mock(Map.class); - when(insertErrorMap.isEmpty()).thenReturn(false); - when(insertErrorMap.size()).thenReturn(1); - when(insertErrorMap.keySet()).thenReturn(failedRowSet); + BigQueryError insertError = new BigQueryError("reason", "location", "message"); + Map> insertErrorMap = Collections.singletonMap(1L, Collections.singletonList(insertError)); InsertAllResponse insertAllResponseWithError = mock(InsertAllResponse.class); 
when(insertAllResponseWithError.hasErrors()).thenReturn(true); when(insertAllResponseWithError.getInsertErrors()).thenReturn(insertErrorMap); - Map> emptyMap = mock(Map.class); - when(emptyMap.isEmpty()).thenReturn(true); - InsertAllResponse insertAllResponseNoError = mock(InsertAllResponse.class); when(insertAllResponseNoError.hasErrors()).thenReturn(true); - when(insertAllResponseNoError.getInsertErrors()).thenReturn(emptyMap); + when(insertAllResponseNoError.getInsertErrors()).thenReturn(Collections.emptyMap()); BigQuery bigQuery = mock(BigQuery.class); Table mockTable = mock(Table.class); @@ -220,9 +213,10 @@ public void testBigQueryPartialFailure() { SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager schemaManager = mock(SchemaManager.class); - + Map cache = new HashMap<>(); Storage storage = mock(Storage.class); - BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.initialize(sinkTaskContext); testTask.start(properties); testTask.put(sinkRecordList); @@ -237,26 +231,24 @@ public void testBigQueryPartialFailure() { assertEquals("test_topic-1-1", varArgs.getAllValues().get(1).getRows().get(0).getId()); } - @Test(expected = BigQueryConnectException.class) + @Test public void testBigQueryCompleteFailure() { final String topic = "test_topic"; final String dataset = "scratch"; final Map properties = makeProperties("3", "2000", topic, dataset); + BigQueryError insertError = new BigQueryError("reason", "location", "message"); - Map> insertErrorMap = mock(Map.class); - when(insertErrorMap.isEmpty()).thenReturn(false); - when(insertErrorMap.size()).thenReturn(2); + Map> insertErrorMap = new HashMap<>(); + insertErrorMap.put(1L, Collections.singletonList(insertError)); + insertErrorMap.put(2L, Collections.singletonList(insertError)); InsertAllResponse insertAllResponseWithError = mock(InsertAllResponse.class); when(insertAllResponseWithError.hasErrors()).thenReturn(true); when(insertAllResponseWithError.getInsertErrors()).thenReturn(insertErrorMap); - Map> emptyMap = mock(Map.class); - when(emptyMap.isEmpty()).thenReturn(true); - InsertAllResponse insertAllResponseNoError = mock(InsertAllResponse.class); when(insertAllResponseNoError.hasErrors()).thenReturn(true); - when(insertAllResponseNoError.getInsertErrors()).thenReturn(emptyMap); + when(insertAllResponseNoError.getInsertErrors()).thenReturn(Collections.emptyMap()); BigQuery bigQuery = mock(BigQuery.class); Table mockTable = mock(Table.class); @@ -274,15 +266,17 @@ public void testBigQueryCompleteFailure() { SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager schemaManager = mock(SchemaManager.class); - + Map cache = new HashMap<>(); Storage storage = mock(Storage.class); - BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.initialize(sinkTaskContext); testTask.start(properties); testTask.put(sinkRecordList); - testTask.flush(Collections.emptyMap()); + Exception expectedEx = assertThrows(BigQueryConnectException.class, + () -> testTask.flush(Collections.emptyMap())); + assertTrue(expectedEx.getCause().getMessage().contains("test_topic")); } - /** * Utility method for making and retrieving properties based on provided parameters. 
* @param bigqueryRetry The number of retries. @@ -296,10 +290,11 @@ private Map makeProperties(String bigqueryRetry, String topic, String dataset) { Map properties = propertiesFactory.getProperties(); - properties.put(BigQuerySinkTaskConfig.BIGQUERY_RETRY_CONFIG, bigqueryRetry); - properties.put(BigQuerySinkTaskConfig.BIGQUERY_RETRY_WAIT_CONFIG, bigqueryRetryWait); + properties.put(BigQuerySinkConfig.BIGQUERY_RETRY_CONFIG, bigqueryRetry); + properties.put(BigQuerySinkConfig.BIGQUERY_RETRY_WAIT_CONFIG, bigqueryRetryWait); properties.put(BigQuerySinkConfig.TOPICS_CONFIG, topic); properties.put(BigQuerySinkConfig.DEFAULT_DATASET_CONFIG, dataset); + properties.put(BigQuerySinkTaskConfig.TASK_ID_CONFIG, "6"); return properties; } diff --git a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/write/row/GCSToBQWriterTest.java b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/write/row/GCSToBQWriterTest.java index 435328b55..d81c48717 100644 --- a/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/write/row/GCSToBQWriterTest.java +++ b/kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/write/row/GCSToBQWriterTest.java @@ -1,7 +1,7 @@ -package com.wepay.kafka.connect.bigquery.write.row; - /* - * Copyright 2019 WePay, Inc. + * Copyright 2020 Confluent, Inc. + * + * This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,16 +17,17 @@ * under the License. */ +package com.wepay.kafka.connect.bigquery.write.row; import com.google.cloud.bigquery.BigQuery; import com.google.cloud.bigquery.Table; -import com.google.cloud.storage.Blob; +import com.google.cloud.bigquery.TableId; import com.google.cloud.storage.BlobInfo; import com.google.cloud.storage.Storage; import com.google.cloud.storage.StorageException; import com.wepay.kafka.connect.bigquery.BigQuerySinkTask; import com.wepay.kafka.connect.bigquery.SchemaManager; -import com.wepay.kafka.connect.bigquery.SinkTaskPropertiesFactory; +import com.wepay.kafka.connect.bigquery.SinkPropertiesFactory; import com.wepay.kafka.connect.bigquery.api.SchemaRetriever; import com.wepay.kafka.connect.bigquery.config.BigQuerySinkConfig; import com.wepay.kafka.connect.bigquery.config.BigQuerySinkTaskConfig; @@ -41,6 +42,7 @@ import org.junit.Test; import java.util.Collections; +import java.util.HashMap; import java.util.Map; import static org.mockito.Matchers.anyObject; @@ -51,11 +53,11 @@ public class GCSToBQWriterTest { - private static SinkTaskPropertiesFactory propertiesFactory; + private static SinkPropertiesFactory propertiesFactory; @BeforeClass public static void initializePropertiesFactory() { - propertiesFactory = new SinkTaskPropertiesFactory(); + propertiesFactory = new SinkPropertiesFactory(); } @Test @@ -72,8 +74,9 @@ public void testGCSNoFailure(){ SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager schemaManager = mock(SchemaManager.class); + Map cache = new HashMap<>(); - BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.initialize(sinkTaskContext); testTask.start(properties); testTask.put( @@ -97,12 +100,13 @@ public void testGCSSomeFailures(){ SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager schemaManager 
= mock(SchemaManager.class); + Map cache = new HashMap<>(); when(storage.create((BlobInfo)anyObject(), (byte[])anyObject())) .thenThrow(new StorageException(500, "internal server error")) // throw first time .thenReturn(null); // return second time. (we don't care about the result.) - BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.initialize(sinkTaskContext); testTask.start(properties); testTask.put( @@ -126,11 +130,12 @@ public void testGCSAllFailures(){ SchemaRetriever schemaRetriever = mock(SchemaRetriever.class); SchemaManager schemaManager = mock(SchemaManager.class); + Map cache = new HashMap<>(); when(storage.create((BlobInfo)anyObject(), (byte[])anyObject())) .thenThrow(new StorageException(500, "internal server error")); - BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager); + BigQuerySinkTask testTask = new BigQuerySinkTask(bigQuery, schemaRetriever, storage, schemaManager, cache); testTask.initialize(sinkTaskContext); testTask.start(properties); testTask.put( @@ -161,10 +166,11 @@ private Map makeProperties(String bigqueryRetry, String topic, String dataset) { Map properties = propertiesFactory.getProperties(); - properties.put(BigQuerySinkTaskConfig.BIGQUERY_RETRY_CONFIG, bigqueryRetry); - properties.put(BigQuerySinkTaskConfig.BIGQUERY_RETRY_WAIT_CONFIG, bigqueryRetryWait); + properties.put(BigQuerySinkConfig.BIGQUERY_RETRY_CONFIG, bigqueryRetry); + properties.put(BigQuerySinkConfig.BIGQUERY_RETRY_WAIT_CONFIG, bigqueryRetryWait); properties.put(BigQuerySinkConfig.TOPICS_CONFIG, topic); properties.put(BigQuerySinkConfig.DEFAULT_DATASET_CONFIG, dataset); + properties.put(BigQuerySinkTaskConfig.TASK_ID_CONFIG, "9"); // gcs config properties.put(BigQuerySinkConfig.ENABLE_BATCH_CONFIG, topic); properties.put(BigQuerySinkConfig.GCS_BUCKET_NAME_CONFIG, "myBucket"); diff --git a/kcbq-connector/test/resources/test_schemas/gcs-load/data.json b/kcbq-connector/src/test/resources/integration_test_cases/gcs-load/data.json similarity index 100% rename from kcbq-connector/test/resources/test_schemas/gcs-load/data.json rename to kcbq-connector/src/test/resources/integration_test_cases/gcs-load/data.json diff --git a/kcbq-connector/test/resources/test_schemas/gcs-load/schema.json b/kcbq-connector/src/test/resources/integration_test_cases/gcs-load/schema.json similarity index 100% rename from kcbq-connector/test/resources/test_schemas/gcs-load/schema.json rename to kcbq-connector/src/test/resources/integration_test_cases/gcs-load/schema.json diff --git a/kcbq-connector/test/resources/test_schemas/logical-types/data.json b/kcbq-connector/src/test/resources/integration_test_cases/logical-types/data.json similarity index 100% rename from kcbq-connector/test/resources/test_schemas/logical-types/data.json rename to kcbq-connector/src/test/resources/integration_test_cases/logical-types/data.json diff --git a/kcbq-connector/test/resources/test_schemas/logical-types/schema.json b/kcbq-connector/src/test/resources/integration_test_cases/logical-types/schema.json similarity index 100% rename from kcbq-connector/test/resources/test_schemas/logical-types/schema.json rename to kcbq-connector/src/test/resources/integration_test_cases/logical-types/schema.json diff --git a/kcbq-connector/test/resources/test_schemas/matryoshka-dolls/data.json 
b/kcbq-connector/src/test/resources/integration_test_cases/matryoshka-dolls/data.json similarity index 100% rename from kcbq-connector/test/resources/test_schemas/matryoshka-dolls/data.json rename to kcbq-connector/src/test/resources/integration_test_cases/matryoshka-dolls/data.json diff --git a/kcbq-connector/test/resources/test_schemas/matryoshka-dolls/schema.json b/kcbq-connector/src/test/resources/integration_test_cases/matryoshka-dolls/schema.json similarity index 100% rename from kcbq-connector/test/resources/test_schemas/matryoshka-dolls/schema.json rename to kcbq-connector/src/test/resources/integration_test_cases/matryoshka-dolls/schema.json diff --git a/kcbq-connector/test/resources/test_schemas/nulls/data.json b/kcbq-connector/src/test/resources/integration_test_cases/nulls/data.json similarity index 100% rename from kcbq-connector/test/resources/test_schemas/nulls/data.json rename to kcbq-connector/src/test/resources/integration_test_cases/nulls/data.json diff --git a/kcbq-connector/test/resources/test_schemas/nulls/schema.json b/kcbq-connector/src/test/resources/integration_test_cases/nulls/schema.json similarity index 100% rename from kcbq-connector/test/resources/test_schemas/nulls/schema.json rename to kcbq-connector/src/test/resources/integration_test_cases/nulls/schema.json diff --git a/kcbq-connector/test/resources/test_schemas/primitives/data.json b/kcbq-connector/src/test/resources/integration_test_cases/primitives/data.json similarity index 100% rename from kcbq-connector/test/resources/test_schemas/primitives/data.json rename to kcbq-connector/src/test/resources/integration_test_cases/primitives/data.json diff --git a/kcbq-connector/test/resources/test_schemas/primitives/schema.json b/kcbq-connector/src/test/resources/integration_test_cases/primitives/schema.json similarity index 100% rename from kcbq-connector/test/resources/test_schemas/primitives/schema.json rename to kcbq-connector/src/test/resources/integration_test_cases/primitives/schema.json diff --git a/kcbq-connector/src/test/resources/log4j.properties b/kcbq-connector/src/test/resources/log4j.properties new file mode 100644 index 000000000..8a383ba87 --- /dev/null +++ b/kcbq-connector/src/test/resources/log4j.properties @@ -0,0 +1,42 @@ +# +# Copyright 2020 Confluent, Inc. +# +# This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +log4j.rootLogger=INFO, stdout + +# Send the logs to the console. 
+# +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout + +connect.log.pattern=[%d] %p %X{connector.context}%m (%c:%L)%n +log4j.appender.stdout.layout.ConversionPattern=${connect.log.pattern} +log4j.appender.connectAppender.layout.ConversionPattern=${connect.log.pattern} + +# These are used in the log4j properties file that ships by default with Connect +log4j.logger.org.apache.zookeeper=ERROR +log4j.logger.org.reflections=ERROR + +log4j.logger.com.wepay.kafka.connect.bigquery=DEBUG + +# We see a lot of WARN-level messages from this class when a table is created by the connector and +# then written to shortly after. No need for that much noise during routine tests +log4j.logger.com.wepay.kafka.connect.bigquery.write.batch.TableWriter=ERROR +# Logs a message at INFO on every http request +log4j.logger.org.apache.kafka.connect.util.clusters.EmbeddedConnectCluster=WARN +log4j.logger.com.wepay.kafka.connect.bigquery.integration.BigQueryErrorResponsesIT=DEBUG diff --git a/kcbq-connector/test/docker/connect/Dockerfile b/kcbq-connector/test/docker/connect/Dockerfile deleted file mode 100644 index 447e6e9a9..000000000 --- a/kcbq-connector/test/docker/connect/Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright 2016 WePay, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -# Builds a docker image for the Kafka-BigQuery Connector. -# Expects links to "kafka" and "schema-registry" containers. -# -# Usage: -# docker build -t kcbq/connect connect -# docker run --name kcbq_test_connect \ -# --link kcbq_test_kafka:kafka --link kcbq_test_schema-registry:schema-registry \ -# kcbq/connect - -FROM confluentinc/cp-kafka-connect-base:4.1.2 - -COPY connect-docker.sh /usr/local/bin/ - -RUN ["chmod", "+x", "/usr/local/bin/connect-docker.sh"] - -RUN ["mkdir", "/usr/logs"] -RUN ["chmod", "a+rwx", "/usr/logs"] - -RUN ["mkdir", "-p", "/usr/local/share/kafka/plugins/kafka-connect-bigquery"] -RUN ["chmod", "a+rwx", "/usr/local/share/kafka/plugins/kafka-connect-bigquery"] - -USER root -ENTRYPOINT ["/usr/local/bin/connect-docker.sh"] diff --git a/kcbq-connector/test/docker/connect/connect-docker.sh b/kcbq-connector/test/docker/connect/connect-docker.sh deleted file mode 100755 index 65c2bd606..000000000 --- a/kcbq-connector/test/docker/connect/connect-docker.sh +++ /dev/null @@ -1,26 +0,0 @@ -#! /usr/bin/env bash -# Copyright 2016 WePay, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -tar -C /usr/local/share/kafka/plugins/kafka-connect-bigquery/ -xf /usr/local/share/kafka/plugins/kafka-connect-bigquery/kcbq.tar - -connect-standalone \ - /etc/kafka-connect-bigquery/standalone.properties \ - /etc/kafka-connect-bigquery/connector.properties & - -# Time (seconds) to wait for the process for inserting rows into BigQuery to be done. -# This time can be adjusted if necessary. -sleep 180 -kill $! diff --git a/kcbq-connector/test/docker/populate/Dockerfile b/kcbq-connector/test/docker/populate/Dockerfile deleted file mode 100644 index c91dcecf6..000000000 --- a/kcbq-connector/test/docker/populate/Dockerfile +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2016 WePay, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -# Populates Kafka and Schema Registry with test data -# Expects links to "kafka" and "schema-registry" containers. -# -# Usage: -# docker build -t kcbq/populate populate -# docker run --name kcbq_test_populate \ -# --link kcbq_test_kafka:kafka --link kcbq_test_schema-registry:schema-registry \ -# kcbq/populate - -FROM confluentinc/cp-schema-registry:4.1.2 - -COPY populate-docker.sh /usr/local/bin/ - -RUN ["chmod", "+x", "/usr/local/bin/populate-docker.sh"] - -USER root -ENTRYPOINT ["/usr/local/bin/populate-docker.sh"] diff --git a/kcbq-connector/test/docker/populate/populate-docker.sh b/kcbq-connector/test/docker/populate/populate-docker.sh deleted file mode 100755 index aed681140..000000000 --- a/kcbq-connector/test/docker/populate/populate-docker.sh +++ /dev/null @@ -1,24 +0,0 @@ -#! /usr/bin/env bash -# Copyright 2016 WePay, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -for schema_dir in /tmp/schemas/*; do - kafka-avro-console-producer \ - --topic "kcbq_test_`basename $schema_dir`" \ - --broker-list 'kafka:29092' \ - --property value.schema="`cat \"$schema_dir/schema.json\"`" \ - --property schema.registry.url='http://schema-registry:8081' \ - < "$schema_dir/data.json" -done diff --git a/kcbq-connector/test/integrationtest.sh b/kcbq-connector/test/integrationtest.sh deleted file mode 100755 index a88c5518b..000000000 --- a/kcbq-connector/test/integrationtest.sh +++ /dev/null @@ -1,293 +0,0 @@ -#! /usr/bin/env bash -# Copyright 2016 WePay, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -#################################################################################################### -# Basic script setup - -set -e - -if [[ -t 1 ]]; then - NORMAL="$(tput sgr0)" - BOLD="$(tput bold)" - RED="$(tput setaf 1)" - GREEN="$(tput setaf 2)" - YELLOW="$(tput setaf 3)" -else - unset NORMAL BOLD RED GREEN YELLOW -fi - -usage() { - echo -e "usage: $0\n" \ - "[-k|--key-file ]\n" \ - "[-k|--key-source ] (path must be absolute; relative paths will not work)\n" \ - "[-p|--project ]\n" \ - "[-d|--dataset ]\n" \ - "[-b|--bucket \n]" \ - "[-f|--folder \n]" \ - 1>&2 - echo 1>&2 - echo "Options can also be specified via environment variable:" \ - "KCBQ_TEST_KEYFILE, KCBQ_TEST_PROJECT, KCBQ_TEST_DATASET, KCBQ_TEST_BUCKET, and KCBQ_TEST_FOLDER" \ - "respectively control the keyfile, project, dataset, and bucket." \ - 1>&2 - echo 1>&2 - echo "Options can also be specified in a file named 'test.conf'" \ - "placed in the same directory as this script, with a series of = lines." \ - "The properties are 'keyfile', 'project', 'dataset', and 'bucket'." \ - 1>&2 - echo 1>&2 - echo "The descending order of priority for each of these forms of specification is:" \ - "command line option, environment variable, configuration file." \ - 1>&2 - # Accept an optional exit value parameter - exit ${1:-0} -} - -msg() { printf "$1%s: $2$NORMAL\n" "$(basename $0)"; } -error() { msg "$RED" "$*"; exit 1; } >&2 -warn() { msg "$YELLOW" "$*"; } >&2 -statusupdate() { msg "$GREEN" "$*"; } -log() { msg "$BOLD" "$*"; } - - -BASE_DIR=$(dirname "$0") -GRADLEW="$BASE_DIR/../../gradlew" - -#################################################################################################### -# Configuration processing - -# Read in properties file, if it exists and can be read -PROPERTIES_FILE="$BASE_DIR/test.conf" -[[ -f "$PROPERTIES_FILE" ]] && [[ -r "$PROPERTIES_FILE" ]] && source "$PROPERTIES_FILE" - -# Copy the file's properties into actual test variables, -# without overriding any that have already been specified -KCBQ_TEST_KEYFILE=${KCBQ_TEST_KEYFILE:-$keyfile} -KCBQ_TEST_PROJECT=${KCBQ_TEST_PROJECT:-$project} -KCBQ_TEST_DATASET=${KCBQ_TEST_DATASET:-$dataset} -KCBQ_TEST_BUCKET=${KCBQ_TEST_BUCKET:-$bucket} -KCBQ_TEST_FOLDER=${KCBQ_TEST_FOLDER:-$folder} -KCBQ_TEST_KEYSOURCE=${KCBQ_TEST_KEYSOURCE:-$keysource} - -# Capture any command line flags -while [[ $# -gt 0 ]]; do - case "$1" in - -k|--key-file) - [[ -z "$2" ]] && { error "key filename must follow $1 flag"; usage 1; } - shift - KCBQ_TEST_KEYFILE="$1" - ;; - -p|--project) - [[ -z "$2" ]] && { error "project name must follow $1 flag"; usage 1; } - shift - KCBQ_TEST_PROJECT="$1" - ;; - -d|--dataset) - [[ -z "$2" ]] && { error "dataset name must follow $1 flag"; usage 1; } - shift - KCBQ_TEST_DATASET="$1" - ;; - -b|--bucket) - [[ -z "$2" ]] && { error "bucket name must follow $1 flag"; usage 1; } - shift - KCBQ_TEST_BUCKET="$1" - ;; - -b|--folder) - [[ -z "$2" ]] && { error "folder name must follow $1 flag"; usage 1; } - shift - KCBQ_TEST_FOLDER="$1" - ;; - -h|--help|'-?') - usage 0 - ;; - -kf|--key-source) - [[ -z "$2" ]] && { error "key filename must follow 
$1 flag"; usage 1; } - shift - KCBQ_TEST_KEYSOURCE="$1" - ;; - *) - error "unrecognized option: '$1'"; usage 1 - ;; - esac - shift -done - -# Make sure required arguments have been provided one way or another -[[ -z "$KCBQ_TEST_KEYFILE" ]] && { error 'a key filename is required'; usage 1; } -[[ -z "$KCBQ_TEST_PROJECT" ]] && { error 'a project name is required'; usage 1; } -[[ -z "$KCBQ_TEST_DATASET" ]] && { error 'a dataset name is required'; usage 1; } -[[ -z "$KCBQ_TEST_BUCKET" ]] && { error 'a bucket name is required'; usage 1; } - -#################################################################################################### -# Schema Registry Docker initialization - -if echo | xargs --no-run-if-empty; then - xargs() { command xargs --no-run-if-empty "$@"; } -else - xargs() { command xargs "$@"; } -fi 2> /dev/null - - -dockercleanup() { - log 'Cleaning up leftover Docker containers' - docker ps -aq -f 'name=kcbq_test_(zookeeper|kafka|schema-registry|populate|connect)' \ - | xargs docker rm -f > /dev/null -} - -dockerimageexists() { - docker images --format '{{ .Repository }}' | grep -q "$1" -} - -# Cleanup these on exit in case something goes wrong -trap dockercleanup EXIT -# And remove any that are still around right now -dockercleanup - -DOCKER_DIR="$BASE_DIR/docker" - -ZOOKEEPER_DOCKER_NAME='kcbq_test_zookeeper' -KAFKA_DOCKER_NAME='kcbq_test_kafka' -SCHEMA_REGISTRY_DOCKER_NAME='kcbq_test_schema-registry' - -statusupdate 'Creating Zookeeper Docker instance' -docker run --name "$ZOOKEEPER_DOCKER_NAME" \ - -d \ - -e ZOOKEEPER_CLIENT_PORT=32181 \ - confluentinc/cp-zookeeper:4.1.2 - -statusupdate 'Creating Kafka Docker instance' -docker run --name "$KAFKA_DOCKER_NAME" \ - --link "$ZOOKEEPER_DOCKER_NAME":zookeeper \ - --add-host kafka:127.0.0.1 \ - -d \ - -e KAFKA_ZOOKEEPER_CONNECT=zookeeper:32181 \ - -e KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://kafka:29092 \ - -e KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1 \ - confluentinc/cp-kafka:4.1.2 - -statusupdate 'Creating Schema Registry Docker instance' -# Have to pause here to make sure Zookeeper/Kafka get on their feet first -sleep 5 -docker run --name "$SCHEMA_REGISTRY_DOCKER_NAME" \ - --link "$ZOOKEEPER_DOCKER_NAME":zookeeper --link "$KAFKA_DOCKER_NAME":kafka \ - --env SCHEMA_REGISTRY_AVRO_COMPATIBILITY_LEVEL=none \ - -d \ - -e SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL=zookeeper:32181 \ - -e SCHEMA_REGISTRY_HOST_NAME=schema-registry \ - confluentinc/cp-schema-registry:4.1.2 - -#################################################################################################### -# Writing data to Kafka Docker instance via Avro console producer -statusupdate 'Populating Kafka/Schema Registry Docker instances with test data' - -POPULATE_DOCKER_IMAGE='kcbq/populate' -POPULATE_DOCKER_NAME='kcbq_test_populate' - -if ! 
dockerimageexists "$POPULATE_DOCKER_IMAGE"; then - docker build -q -t "$POPULATE_DOCKER_IMAGE" "$DOCKER_DIR/populate" -fi -# Have to pause here to make sure the Schema Registry gets on its feet first -sleep 35 -docker create --name "$POPULATE_DOCKER_NAME" \ - --link "$KAFKA_DOCKER_NAME:kafka" --link "$SCHEMA_REGISTRY_DOCKER_NAME:schema-registry" \ - "$POPULATE_DOCKER_IMAGE" -docker cp "$BASE_DIR/resources/test_schemas/" "$POPULATE_DOCKER_NAME:/tmp/schemas/" -docker start -a "$POPULATE_DOCKER_NAME" - -#################################################################################################### -# Deleting existing BigQuery tables/bucket -warn 'Deleting existing BigQuery test tables and existing GCS bucket' - - -test_tables= -test_topics= -for file in "$BASE_DIR"/resources/test_schemas/*; do - test_tables+="${test_tables:+ }kcbq_test_$(basename "${file/-/_}")" - test_topics+="${test_topics:+,}kcbq_test_$(basename "$file")" -done - -"$GRADLEW" -p "$BASE_DIR/.." \ - -Pkcbq_test_keyfile="$KCBQ_TEST_KEYFILE" \ - -Pkcbq_test_project="$KCBQ_TEST_PROJECT" \ - -Pkcbq_test_dataset="$KCBQ_TEST_DATASET" \ - -Pkcbq_test_tables="$test_tables" \ - -Pkcbq_test_bucket="$KCBQ_TEST_BUCKET" \ - -Pkcbq_test_keysource="$KCBQ_TEST_KEYSOURCE" \ - integrationTestPrep - -#################################################################################################### -# Executing connector in standalone mode (this is the execution portion of the actual test) -statusupdate 'Executing Kafka Connect in Docker' - -# Run clean task to ensure there's only one connector tarball in the build/dist directory -"$GRADLEW" -q -p "$BASE_DIR/../.." clean distTar - -[[ ! -e "$DOCKER_DIR/connect/properties" ]] && mkdir "$DOCKER_DIR/connect/properties" -RESOURCES_DIR="$BASE_DIR/resources" - -STANDALONE_PROPS="$DOCKER_DIR/connect/properties/standalone.properties" -cp "$RESOURCES_DIR/standalone-template.properties" "$STANDALONE_PROPS" - -CONNECTOR_PROPS="$DOCKER_DIR/connect/properties/connector.properties" -cp "$RESOURCES_DIR/connector-template.properties" "$CONNECTOR_PROPS" -cat << EOF >> $CONNECTOR_PROPS -project=$KCBQ_TEST_PROJECT -datasets=.*=$KCBQ_TEST_DATASET -gcsBucketName=$KCBQ_TEST_BUCKET -gcsFolderName=$KCBQ_TEST_FOLDER -topics=$test_topics - -EOF - -CONNECT_DOCKER_IMAGE='kcbq/connect' -CONNECT_DOCKER_NAME='kcbq_test_connect' - -cp "$BASE_DIR"/../../kcbq-confluent/build/distributions/kcbq-confluent-*.tar "$DOCKER_DIR/connect/kcbq.tar" -if [[ "$KCBQ_TEST_KEYSOURCE" == "JSON" ]]; then - echo "$KCBQ_TEST_KEYFILE" > "$DOCKER_DIR/connect/key.json" -else - cp "$KCBQ_TEST_KEYFILE" "$DOCKER_DIR/connect/key.json" -fi - -if ! 
-  docker build -q -t "$CONNECT_DOCKER_IMAGE" "$DOCKER_DIR/connect"
-fi
-docker create --name "$CONNECT_DOCKER_NAME" \
-    --link "$KAFKA_DOCKER_NAME:kafka" --link "$SCHEMA_REGISTRY_DOCKER_NAME:schema-registry" \
-    -t "$CONNECT_DOCKER_IMAGE" /bin/bash
-docker cp "$DOCKER_DIR/connect/kcbq.tar" "$CONNECT_DOCKER_NAME:/usr/local/share/kafka/plugins/kafka-connect-bigquery/kcbq.tar"
-docker cp "$DOCKER_DIR/connect/properties/" "$CONNECT_DOCKER_NAME:/etc/kafka-connect-bigquery/"
-docker cp "$DOCKER_DIR/connect/key.json" "$CONNECT_DOCKER_NAME:/tmp/key.json"
-docker start -a "$CONNECT_DOCKER_NAME"
-
-####################################################################################################
-# Checking on BigQuery data via Java test (this is the verification portion of the actual test)
-statusupdate 'Verifying that test data made it successfully to BigQuery'
-
-INTEGRATION_TEST_RESOURCE_DIR="$BASE_DIR/../src/integration-test/resources"
-[[ ! -d "$INTEGRATION_TEST_RESOURCE_DIR" ]] && mkdir -p "$INTEGRATION_TEST_RESOURCE_DIR"
-
-cat << EOF > "$INTEGRATION_TEST_RESOURCE_DIR/test.properties"
-keyfile=$KCBQ_TEST_KEYFILE
-project=$KCBQ_TEST_PROJECT
-dataset=$KCBQ_TEST_DATASET
-bucket=$KCBQ_TEST_BUCKET
-folder=$KCBQ_TEST_FOLDER
-keysource=$KCBQ_TEST_KEYSOURCE
-EOF
-
-
-"$GRADLEW" -p "$BASE_DIR/.." cleanIntegrationTest integrationTest
diff --git a/kcbq-connector/test/resources/connector-template.properties b/kcbq-connector/test/resources/connector-template.properties
deleted file mode 100644
index a8aa8e766..000000000
--- a/kcbq-connector/test/resources/connector-template.properties
+++ /dev/null
@@ -1,41 +0,0 @@
-# Copyright 2016 WePay, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-name=bigquery-connector
-connector.class=com.wepay.kafka.connect.bigquery.BigQuerySinkConnector
-tasks.max=1
-defaultDataset=
-autoUpdateSchemas=true
-
-sanitizeTopics=true
-
-bufferSize=100000
-maxWriteSize=1000
-tableWriteWait=1000
-
-keyfile=/tmp/key.json
-schemaRetriever=com.wepay.kafka.connect.bigquery.retrieve.IdentitySchemaRetriever
-
-enableBatchLoad=kcbq_test_gcs-load
-batchLoadIntervalSec=10
-
-# An example regex router SMT that strips (kcbq_) from the topic name.
-# Replace with relevant regex to replace the topic of each sink record with
-# destination dataset and table name in the format <dataset>:<tableName> or only the destination
-# table name in the format <tableName>
-transforms=RegexTransformation
-transforms.RegexTransformation.type=org.apache.kafka.connect.transforms.RegexRouter
-transforms.RegexTransformation.regex=(kcbq_)(.*)
-transforms.RegexTransformation.replacement=$2
diff --git a/kcbq-connector/test/resources/standalone-template.properties b/kcbq-connector/test/resources/standalone-template.properties
deleted file mode 100644
index cb9998503..000000000
--- a/kcbq-connector/test/resources/standalone-template.properties
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright 2016 WePay, Inc.
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -bootstrap.servers=kafka:29092 -key.converter=io.confluent.connect.avro.AvroConverter -key.converter.schema.registry.url=http://schema-registry:8081 -value.converter=io.confluent.connect.avro.AvroConverter -value.converter.schema.registry.url=http://schema-registry:8081 -internal.key.converter=org.apache.kafka.connect.json.JsonConverter -internal.value.converter=org.apache.kafka.connect.json.JsonConverter -internal.key.converter.schemas.enable=false -internal.value.converter.schemas.enable=false -offset.storage.file.filename=/tmp/connect.offsets -offset.flush.interval.ms=10000 -plugin.path=/usr/local/share/kafka/plugins diff --git a/pom.xml b/pom.xml new file mode 100644 index 000000000..402e32f0f --- /dev/null +++ b/pom.xml @@ -0,0 +1,472 @@ + + + + 4.0.0 + + com.wepay.kcbq + kcbq-parent + 2.5.0-SNAPSHOT + pom + + + kcbq-api + kcbq-connector + + + + 8 + + 5.5.1 + 0.6.1 + 0.21.1 + 2.10.9 + 1.113.4 + 2.10.2 + 2.5.0 + 2.12 + 1.7.26 + 2.8.6 + + 4.13 + 3.2.4 + + 1.4 + 2.15 + 6.18 + 3.8.1 + 0.8.5 + 0.11.1 + 2.5.3 + 3.7.1 + 3.0.0-M4 + + ${project.basedir} + ${maven.test.skip} + + + kafka-connect-bigquery-parent + + https://github.com/confluentinc/kafka-connect-bigquery + + 2016 + + + + Apache License 2.0 + https://www.apache.org/licenses/LICENSE-2.0 + repo + + + + + scm:git:git://github.com/confluentinc/kafka-connect-bigquery.git + scm:git:git@github.com:confluentinc/kafka-connect-bigquery.git + https://github.com/confluentinc/kafka-connect-bigquery + HEAD + + + + + C0urante + Chris Egerton + fearthecellos@gmail.com + America/New_York + + + moirat + Moira Tagle + moirat@wepay.com + America/Los_Angeles + + + + + + confluent + https://packages.confluent.io/maven/ + + + jcenter + https://jcenter.bintray.com + + + jitpack.io + https://jitpack.io + + + + + + confluent + https://packages.confluent.io/maven/ + + + jcenter + https://jcenter.bintray.com + + + jitpack.io + https://jitpack.io + + + + + + + + com.wepay.kcbq + kcbq-api + ${project.version} + + + + + org.apache.kafka + connect-api + ${kafka.version} + provided + + + org.apache.kafka + kafka-clients + ${kafka.version} + provided + + + + com.google.cloud + google-cloud-bigquery + ${google.cloud.version} + + + com.google.cloud + google-cloud-storage + ${google.cloud.storage.version} + + + com.google.auth + google-auth-library-oauth2-http + ${google.auth.version} + + + org.slf4j + slf4j-api + ${slf4j.version} + + + io.debezium + debezium-core + ${debezium.version} + + + + + com.fasterxml.jackson.core + jackson-core + ${jackson.version} + test + + + junit + junit + ${junit.version} + test + + + org.mockito + mockito-core + ${mockito.version} + test + + + org.slf4j + slf4j-log4j12 + ${slf4j.version} + test + + + org.apache.kafka + kafka_${kafka.scala.version} + ${kafka.version} + test + + + org.apache.kafka + kafka_${kafka.scala.version} + ${kafka.version} + test + test-jar + test + + + org.apache.kafka + kafka-clients + ${kafka.version} + test + test-jar + test + + + org.apache.kafka + 
connect-runtime + ${kafka.version} + test + + + org.apache.kafka + connect-runtime + ${kafka.version} + test + test-jar + test + + + io.confluent + kafka-schema-registry + ${confluent.version} + test + + + io.confluent + kafka-schema-registry + ${confluent.version} + tests + test-jar + test + + + io.confluent + kafka-connect-avro-converter + ${confluent.version} + test + + + io.confluent + kafka-avro-serializer + ${confluent.version} + test + + + + + + + + org.apache.maven.plugins + maven-release-plugin + ${release.plugin.version} + + true + false + v@{project.version} + + + + com.mycila + license-maven-plugin + 3.0 + + +Copyright 2020 Confluent, Inc. + +This software contains code derived from the WePay BigQuery Kafka Connector, Copyright WePay, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. + + + ${main.dir}/config/copyright/custom-header-styles.xml + + + CUSTOM_JAVA_STYLE + JENKINSFILE_STYLE + + + LICENSE.md + *.log + config/checkstyle/google_checks.xml + + + .ci/* + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + ${compiler.plugin.version} + + ${java.version} + ${java.version} + + + + org.apache.maven.plugins + maven-surefire-plugin + ${surefire.plugin.version} + + org.apache.kafka.test.IntegrationTest + ${skip.unit.tests} + + + + org.apache.maven.plugins + maven-failsafe-plugin + ${surefire.plugin.version} + + + embedded-integration-test + + integration-test + + + org.apache.kafka.test.IntegrationTest + + + + + + org.jacoco + jacoco-maven-plugin + ${jacoco.plugin.version} + + + pre-unit-test + + prepare-agent + + + + report + verify + + report + + + + + + org.apache.maven.plugins + maven-checkstyle-plugin + ${checkstyle.plugin.version} + + + validate + validate + + ${project.parent.basedir}/config/checkstyle/google_checks.xml + ${project.parent.basedir}/config/checkstyle/suppressions.xml + + + check + + + + + + com.puppycrawl.tools + checkstyle + ${checkstyle.version} + + + + + + org.apache.maven.plugins + maven-site-plugin + ${site.plugin.version} + + + io.confluent + kafka-connect-maven-plugin + ${kafka.connect.plugin.version} + + + + + + + jenkins + + + + org.codehaus.mojo + buildnumber-maven-plugin + ${buildnumber.plugin.version} + + + generate-test-suffix + + create + + pre-integration-test + + + + + + + + org.apache.maven.plugins + maven-checkstyle-plugin + + true + + + + org.apache.maven.plugins + maven-failsafe-plugin + + + connect-205118 + jenkinsKcbqIntegrationTesting + jenkins-kcbq-integration-testing + -${scmBranch}-${buildNumber}-${timestamp} + ${scmBranch}-${buildNumber}-${timestamp} + + + + + + + + + diff --git a/settings.gradle b/settings.gradle deleted file mode 100644 index a24368b32..000000000 --- a/settings.gradle +++ /dev/null @@ -1 +0,0 @@ -include 'kcbq-connector', 'kcbq-api', 'kcbq-confluent'