diff --git a/.github/workflows/build-extension.yml b/.github/workflows/build-extension.yml index cb8b6784a8cf8f..4125d692f540d3 100644 --- a/.github/workflows/build-extension.yml +++ b/.github/workflows/build-extension.yml @@ -40,6 +40,7 @@ jobs: outputs: broker_changes: ${{ steps.filter.outputs.broker_changes }} docs_changes: ${{ steps.filter.outputs.docs_changes }} + cdc_client_changes: ${{ steps.filter.outputs.cdc_client_changes }} steps: - name: Checkout ${{ github.ref }} uses: actions/checkout@v3 @@ -53,9 +54,11 @@ jobs: with: filters: | broker_changes: - - 'fs_brokers/**' + - 'fs_brokers/apache_hdfs_broker/**' docs_changes: - 'docs/**' + cdc_client_changes: + - 'fs_brokers/cdc_client/**' build-broker: name: Build Broker needs: changes @@ -92,6 +95,41 @@ jobs: - name: Build broker run: | cd fs_brokers/apache_hdfs_broker/ && /bin/bash build.sh + build-cdc-client: + name: Build Cdc Client + needs: changes + if: ${{ needs.changes.outputs.cdc_client_changes == 'true' }} + runs-on: ubuntu-latest + steps: + - name: Checkout ${{ github.ref }} + uses: actions/checkout@v3 + + - name: Setup java + uses: actions/setup-java@v2 + with: + distribution: adopt + java-version: '17' + + - name: Setup thrift + run: | + pushd thirdparty + branch="${{ github.base_ref }}" + if [[ -z "${branch}" ]] || [[ "${branch}" == 'master' || "${branch}" == 'branch-4.0' || "${branch}" == 'branch-3.0' || "${branch}" == 'branch-2.1' ]]; then + curl -L https://github.com/apache/doris-thirdparty/releases/download/automation/doris-thirdparty-prebuilt-linux-x86_64.tar.xz \ + -o doris-thirdparty-prebuilt-linux-x86_64.tar.xz + else + curl -L "https://github.com/apache/doris-thirdparty/releases/download/automation-${branch/branch-/}/doris-thirdparty-prebuilt-linux-x86_64.tar.xz" \ + -o doris-thirdparty-prebuilt-linux-x86_64.tar.xz + fi + tar -xvf doris-thirdparty-prebuilt-linux-x86_64.tar.xz + popd + export PATH="${DEFAULT_DIR}/ldb-toolchain/bin/:$(pwd)/thirdparty/installed/bin/:${PATH}" + + thrift --version + + - name: Build cdc client + run: | + cd fs_brokers/cdc_client/ && /bin/bash build.sh # build-docs: # name: Build Documents # needs: changes diff --git a/be/src/runtime/cdc_client_mgr.cpp b/be/src/runtime/cdc_client_mgr.cpp index e1c7ba976e7913..96864ee8c6dd47 100644 --- a/be/src/runtime/cdc_client_mgr.cpp +++ b/be/src/runtime/cdc_client_mgr.cpp @@ -18,12 +18,14 @@ #include "runtime/cdc_client_mgr.h" #include +#include #include #include #include #include #include #include +#include #include #ifndef __APPLE__ @@ -129,24 +131,25 @@ Status CdcClientMgr::start_cdc_client(PRequestCdcClientResult* result) { if (kill(exist_pid, 0) == 0) { // Process exists, verify it's actually our CDC client by health check std::string check_response; - auto check_st = check_cdc_client_health(1, 0, check_response); + auto check_st = check_cdc_client_health(3, 1, check_response); if (check_st.ok()) { // Process exists and responding, CDC client is running return Status::OK(); } else { // Process exists but CDC client not responding // Either it's a different process (PID reused) or CDC client is unhealthy - // Reset PID and return error - _child_pid.store(0); st = Status::InternalError(fmt::format("CDC client {} unresponsive", exist_pid)); st.to_protobuf(result->mutable_status()); return st; } } else { + LOG(INFO) << "CDC client is dead, pid=" << exist_pid; // Process is dead, reset PID and continue to start _child_pid.store(0); } #endif + } else { + LOG(INFO) << "CDC client has never been started"; } const char* doris_home = 
getenv("DORIS_HOME"); @@ -199,6 +202,17 @@ Status CdcClientMgr::start_cdc_client(PRequestCdcClientResult* result) { #ifndef __APPLE__ prctl(PR_SET_PDEATHSIG, SIGKILL); #endif + // Redirect stdout and stderr to log out file + std::string cdc_out_file = std::string(log_dir) + "/cdc-client.out"; + int out_fd = open(cdc_out_file.c_str(), O_WRONLY | O_CREAT | O_APPEND | O_CLOEXEC, 0644); + if (out_fd < 0) { + perror("open cdc-client.out file failed"); + exit(1); + } + dup2(out_fd, STDOUT_FILENO); + dup2(out_fd, STDERR_FILENO); + close(out_fd); + // java -jar -Dlog.path=xx cdc-client.jar --server.port=9096 --backend.http.port=8040 execlp(java_bin.c_str(), "java", java_opts.c_str(), "-jar", cdc_jar_path.c_str(), cdc_jar_port.c_str(), backend_http_port.c_str(), (char*)NULL); diff --git a/build.sh b/build.sh index ef848701565051..c3851b7c7d6afa 100755 --- a/build.sh +++ b/build.sh @@ -55,6 +55,7 @@ Usage: $0 --broker build Broker. Default ON. --hive-udf build Hive UDF library for Ingestion Load. Default ON. --be-java-extensions build Backend java extensions. Default ON. + --be-cdc-client build Cdc Client for backend. Default ON. --be-extension-ignore build be-java-extensions package, choose which modules to ignore. Multiple modules separated by commas. --clean clean and build target --output specify the output directory diff --git a/docker/thirdparties/docker-compose/postgresql/init/01-create-schema.sql b/docker/thirdparties/docker-compose/postgresql/init/01-create-schema.sql index 7df7a6ff5e2665..3e3c0c05901cbc 100644 --- a/docker/thirdparties/docker-compose/postgresql/init/01-create-schema.sql +++ b/docker/thirdparties/docker-compose/postgresql/init/01-create-schema.sql @@ -17,3 +17,4 @@ create schema doris_test; create schema catalog_pg_test; +create schema cdc_test; diff --git a/docker/thirdparties/docker-compose/postgresql/postgresql-14.yaml.tpl b/docker/thirdparties/docker-compose/postgresql/postgresql-14.yaml.tpl index 65aab34b61c657..6d8873817a5a8a 100644 --- a/docker/thirdparties/docker-compose/postgresql/postgresql-14.yaml.tpl +++ b/docker/thirdparties/docker-compose/postgresql/postgresql-14.yaml.tpl @@ -25,6 +25,14 @@ services: POSTGRES_PASSWORD: 123456 ports: - ${DOCKER_PG_14_EXTERNAL_PORT}:5432 + command: + - "postgres" + - "-c" + - "wal_level=logical" + - "-c" + - "max_wal_senders=30" + - "-c" + - "max_replication_slots=30" healthcheck: test: [ "CMD-SHELL", "pg_isready -U postgres && psql -U postgres -c 'SELECT 1 FROM doris_test.deadline;'" ] interval: 5s diff --git a/fe/fe-common/src/main/java/org/apache/doris/job/cdc/DataSourceConfigKeys.java b/fe/fe-common/src/main/java/org/apache/doris/job/cdc/DataSourceConfigKeys.java index 074c100579c154..9e1918e561431b 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/job/cdc/DataSourceConfigKeys.java +++ b/fe/fe-common/src/main/java/org/apache/doris/job/cdc/DataSourceConfigKeys.java @@ -24,6 +24,7 @@ public class DataSourceConfigKeys { public static final String USER = "user"; public static final String PASSWORD = "password"; public static final String DATABASE = "database"; + public static final String SCHEMA = "schema"; public static final String INCLUDE_TABLES = "include_tables"; public static final String EXCLUDE_TABLES = "exclude_tables"; // initial,earliest,latest,{binlog,postion},\d{13} diff --git a/fe/fe-core/src/main/java/org/apache/doris/job/common/DataSourceType.java b/fe/fe-core/src/main/java/org/apache/doris/job/common/DataSourceType.java index 4ba5670fbd3410..b188f265957530 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/job/common/DataSourceType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/job/common/DataSourceType.java @@ -18,5 +18,6 @@ package org.apache.doris.job.common; public enum DataSourceType { - MYSQL + MYSQL, + POSTGRES } diff --git a/fe/fe-core/src/main/java/org/apache/doris/job/extensions/insert/streaming/DataSourceConfigValidator.java b/fe/fe-core/src/main/java/org/apache/doris/job/extensions/insert/streaming/DataSourceConfigValidator.java index f8850dd9d70f19..fcabcb82898d75 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/job/extensions/insert/streaming/DataSourceConfigValidator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/job/extensions/insert/streaming/DataSourceConfigValidator.java @@ -34,8 +34,10 @@ public class DataSourceConfigValidator { DataSourceConfigKeys.DRIVER_URL, DataSourceConfigKeys.DRIVER_CLASS, DataSourceConfigKeys.DATABASE, + DataSourceConfigKeys.SCHEMA, DataSourceConfigKeys.INCLUDE_TABLES, - DataSourceConfigKeys.EXCLUDE_TABLES + DataSourceConfigKeys.EXCLUDE_TABLES, + DataSourceConfigKeys.SPLIT_SIZE ); public static void validateSource(Map input) throws IllegalArgumentException { diff --git a/fe/fe-core/src/main/java/org/apache/doris/job/extensions/insert/streaming/StreamingInsertJob.java b/fe/fe-core/src/main/java/org/apache/doris/job/extensions/insert/streaming/StreamingInsertJob.java index 16c2354cc7dc68..011894f6287447 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/job/extensions/insert/streaming/StreamingInsertJob.java +++ b/fe/fe-core/src/main/java/org/apache/doris/job/extensions/insert/streaming/StreamingInsertJob.java @@ -26,7 +26,6 @@ import org.apache.doris.common.Config; import org.apache.doris.common.ErrorCode; import org.apache.doris.common.ErrorReport; -import org.apache.doris.common.FeConstants; import org.apache.doris.common.InternalErrorCode; import org.apache.doris.common.Pair; import org.apache.doris.common.UserException; @@ -210,6 +209,11 @@ private void initSourceJob() { init(); checkRequiredSourceProperties(); List createTbls = createTableIfNotExists(); + if (sourceProperties.get(DataSourceConfigKeys.INCLUDE_TABLES) == null) { + // cdc need the final includeTables + String includeTables = String.join(",", createTbls); + sourceProperties.put(DataSourceConfigKeys.INCLUDE_TABLES, includeTables); + } this.offsetProvider = new JdbcSourceOffsetProvider(getJobId(), dataSourceType, sourceProperties); JdbcSourceOffsetProvider rdsOffsetProvider = (JdbcSourceOffsetProvider) this.offsetProvider; rdsOffsetProvider.splitChunks(createTbls); @@ -232,9 +236,6 @@ private void checkRequiredSourceProperties() { "password is required property"); Preconditions.checkArgument(sourceProperties.get(DataSourceConfigKeys.DATABASE) != null, "database is required property"); - Preconditions.checkArgument(sourceProperties.get(DataSourceConfigKeys.INCLUDE_TABLES) != null - || sourceProperties.get(DataSourceConfigKeys.EXCLUDE_TABLES) != null, - "Either include_tables or exclude_tables must be specified"); if (!sourceProperties.containsKey(DataSourceConfigKeys.OFFSET)) { sourceProperties.put(DataSourceConfigKeys.OFFSET, DataSourceConfigKeys.OFFSET_LATEST); } @@ -515,6 +516,7 @@ protected void fetchMeta() { offsetProvider.fetchRemoteMeta(new HashMap<>()); } } catch (Exception ex) { + //todo: The job status = MANUAL_PAUSE_ERR, No need to set failureReason again log.warn("fetch remote meta failed, job id: {}", getJobId(), ex); failureReason = new FailureReason(InternalErrorCode.GET_REMOTE_DATA_ERROR, "Failed to 
fetch meta, " + ex.getMessage()); @@ -572,6 +574,7 @@ public void onStreamTaskFail(AbstractStreamingTask task) throws JobException { public void onStreamTaskSuccess(AbstractStreamingTask task) { try { + resetFailureInfo(null); succeedTaskCount.incrementAndGet(); Env.getCurrentEnv().getJobManager().getStreamingTaskManager().removeRunningTask(task); AbstractStreamingTask nextTask = createStreamingTask(); @@ -729,7 +732,7 @@ public TRow getTvfInfo() { trow.addToColumnValue(new TCell().setStringVal(getJobName())); trow.addToColumnValue(new TCell().setStringVal(getCreateUser().getQualifiedUser())); trow.addToColumnValue(new TCell().setStringVal(getJobConfig().getExecuteType().name())); - trow.addToColumnValue(new TCell().setStringVal(FeConstants.null_string)); + trow.addToColumnValue(new TCell().setStringVal("")); trow.addToColumnValue(new TCell().setStringVal(getJobStatus().name())); trow.addToColumnValue(new TCell().setStringVal(getShowSQL())); trow.addToColumnValue(new TCell().setStringVal(TimeUtils.longToTimeString(getCreateTimeMs()))); @@ -738,30 +741,30 @@ public TRow getTvfInfo() { trow.addToColumnValue(new TCell().setStringVal(String.valueOf(getCanceledTaskCount().get()))); trow.addToColumnValue(new TCell().setStringVal(getComment())); trow.addToColumnValue(new TCell().setStringVal(properties != null && !properties.isEmpty() - ? GsonUtils.GSON.toJson(properties) : FeConstants.null_string)); + ? GsonUtils.GSON.toJson(properties) : "")); if (offsetProvider != null && StringUtils.isNotEmpty(offsetProvider.getShowCurrentOffset())) { trow.addToColumnValue(new TCell().setStringVal(offsetProvider.getShowCurrentOffset())); } else { - trow.addToColumnValue(new TCell().setStringVal(FeConstants.null_string)); + trow.addToColumnValue(new TCell().setStringVal("")); } if (offsetProvider != null && StringUtils.isNotEmpty(offsetProvider.getShowMaxOffset())) { trow.addToColumnValue(new TCell().setStringVal(offsetProvider.getShowMaxOffset())); } else { - trow.addToColumnValue(new TCell().setStringVal(FeConstants.null_string)); + trow.addToColumnValue(new TCell().setStringVal("")); } if (tvfType != null) { trow.addToColumnValue(new TCell().setStringVal( - jobStatistic == null ? FeConstants.null_string : jobStatistic.toJson())); + jobStatistic == null ? "" : jobStatistic.toJson())); } else { trow.addToColumnValue(new TCell().setStringVal( - nonTxnJobStatistic == null ? FeConstants.null_string : nonTxnJobStatistic.toJson())); + nonTxnJobStatistic == null ? "" : nonTxnJobStatistic.toJson())); } trow.addToColumnValue(new TCell().setStringVal(failureReason == null - ? FeConstants.null_string : failureReason.getMsg())); + ? "" : failureReason.getMsg())); trow.addToColumnValue(new TCell().setStringVal(jobRuntimeMsg == null - ? FeConstants.null_string : jobRuntimeMsg)); + ? "" : jobRuntimeMsg)); return trow; } @@ -1064,7 +1067,7 @@ public void gsonPostProcess() throws IOException { * The current streamingTask times out; create a new streamingTask. * Only applies to StreamingMultiTask. 
*/ - public void processTimeoutTasks() { + public void processTimeoutTasks() throws JobException { if (!(runningStreamTask instanceof StreamingMultiTblTask)) { return; } @@ -1073,16 +1076,8 @@ public void processTimeoutTasks() { StreamingMultiTblTask runningMultiTask = (StreamingMultiTblTask) this.runningStreamTask; if (TaskStatus.RUNNING.equals(runningMultiTask.getStatus()) && runningMultiTask.isTimeout()) { - runningMultiTask.cancel(false); - runningMultiTask.setErrMsg("task cancelled cause timeout"); - - // renew streaming multi task - this.runningStreamTask = createStreamingMultiTblTask(); - Env.getCurrentEnv().getJobManager().getStreamingTaskManager().registerTask(runningStreamTask); - this.runningStreamTask.setStatus(TaskStatus.PENDING); - log.info("create new streaming multi tasks due to timeout, for job {}, task {} ", - getJobId(), runningStreamTask.getTaskId()); - recordTasks(runningStreamTask); + runningMultiTask.onFail("task failed cause timeout"); + // renew streaming task by auto resume } } finally { writeUnlock(); @@ -1096,20 +1091,22 @@ public void commitOffset(CommitOffsetRequest offsetRequest) throws JobException } writeLock(); try { - if (offsetRequest.getScannedRows() == 0 && offsetRequest.getScannedBytes() == 0) { - JdbcSourceOffsetProvider op = (JdbcSourceOffsetProvider) offsetProvider; - op.setHasMoreData(false); - } - updateNoTxnJobStatisticAndOffset(offsetRequest); if (this.runningStreamTask != null && this.runningStreamTask instanceof StreamingMultiTblTask) { if (this.runningStreamTask.getTaskId() != offsetRequest.getTaskId()) { throw new JobException("Task id mismatch when commit offset. expected: " + this.runningStreamTask.getTaskId() + ", actual: " + offsetRequest.getTaskId()); } + updateNoTxnJobStatisticAndOffset(offsetRequest); + if (offsetRequest.getScannedRows() == 0 && offsetRequest.getScannedBytes() == 0) { + JdbcSourceOffsetProvider op = (JdbcSourceOffsetProvider) offsetProvider; + op.setHasMoreData(false); + } + persistOffsetProviderIfNeed(); ((StreamingMultiTblTask) this.runningStreamTask).successCallback(offsetRequest); } + } finally { writeUnlock(); } @@ -1134,6 +1131,7 @@ public void replayOffsetProviderIfNeed() throws JobException { * 2. 
Clean chunk info in meta table (jdbc) */ public void cleanup() throws JobException { + log.info("cleanup streaming job {}", getJobId()); // s3 tvf clean offset if (tvfType != null && Config.isCloudMode()) { Cloud.DeleteStreamingJobResponse resp = null; diff --git a/fe/fe-core/src/main/java/org/apache/doris/job/extensions/insert/streaming/StreamingJobSchedulerTask.java b/fe/fe-core/src/main/java/org/apache/doris/job/extensions/insert/streaming/StreamingJobSchedulerTask.java index d0034de4b4a266..2d26db535fd172 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/job/extensions/insert/streaming/StreamingJobSchedulerTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/job/extensions/insert/streaming/StreamingJobSchedulerTask.java @@ -71,7 +71,7 @@ private void handlePendingState() throws JobException { streamingInsertJob.setAutoResumeCount(0); } - private void handleRunningState() { + private void handleRunningState() throws JobException { streamingInsertJob.processTimeoutTasks(); streamingInsertJob.fetchMeta(); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/job/extensions/insert/streaming/StreamingMultiTblTask.java b/fe/fe-core/src/main/java/org/apache/doris/job/extensions/insert/streaming/StreamingMultiTblTask.java index 7c610b406886e9..50bb0fd2acd6f1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/job/extensions/insert/streaming/StreamingMultiTblTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/job/extensions/insert/streaming/StreamingMultiTblTask.java @@ -125,9 +125,9 @@ private void sendWriteRequest() throws JobException { result = future.get(); TStatusCode code = TStatusCode.findByValue(result.getStatus().getStatusCode()); if (code != TStatusCode.OK) { - log.error("Failed to get split from backend, {}", result.getStatus().getErrorMsgs(0)); + log.error("Failed to send write records request, {}", result.getStatus().getErrorMsgs(0)); throw new JobException( - "Failed to get split from backend," + result.getStatus().getErrorMsgs(0) + ", response: " + "Failed to send write records request," + result.getStatus().getErrorMsgs(0) + ", response: " + result.getResponse()); } String response = result.getResponse(); @@ -142,7 +142,7 @@ private void sendWriteRequest() throws JobException { return; } } catch (JsonProcessingException e) { - log.error("Failed to parse write records response: {}", response, e); + log.warn("Failed to parse write records response: {}", response); throw new JobException("Failed to parse write records response: " + response); } throw new JobException("Failed to send write records request , error message: " + response); @@ -257,7 +257,11 @@ public void closeOrReleaseResources() { } public boolean isTimeout() { - return (System.currentTimeMillis() - createTimeMs) > timeoutMs; + if (startTimeMs == null) { + // It's still pending, waiting for scheduling. 
+ return false; + } + return (System.currentTimeMillis() - startTimeMs) > timeoutMs; } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/job/offset/jdbc/JdbcOffset.java b/fe/fe-core/src/main/java/org/apache/doris/job/offset/jdbc/JdbcOffset.java index 83ad4314d8b85b..ba83dd1c8cd1f5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/job/offset/jdbc/JdbcOffset.java +++ b/fe/fe-core/src/main/java/org/apache/doris/job/offset/jdbc/JdbcOffset.java @@ -74,4 +74,12 @@ public String showRange() { return new Gson().toJson(showMap); } } + + @Override + public String toString() { + return "JdbcOffset{" + + "split=" + + split + + '}'; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/job/offset/jdbc/JdbcSourceOffsetProvider.java b/fe/fe-core/src/main/java/org/apache/doris/job/offset/jdbc/JdbcSourceOffsetProvider.java index d7195abac0cee4..2c898b04a07c37 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/job/offset/jdbc/JdbcSourceOffsetProvider.java +++ b/fe/fe-core/src/main/java/org/apache/doris/job/offset/jdbc/JdbcSourceOffsetProvider.java @@ -83,7 +83,7 @@ public class JdbcSourceOffsetProvider implements SourceOffsetProvider { @SerializedName("bop") Map binlogOffsetPersist; - boolean hasMoreData = true; + volatile boolean hasMoreData = true; public JdbcSourceOffsetProvider(Long jobId, DataSourceType sourceType, Map sourceProperties) { this.jobId = jobId; @@ -157,7 +157,7 @@ public void updateOffset(Offset offset) { if (split.snapshotSplit()) { SnapshotSplit snapshotSplit = (SnapshotSplit) split; String splitId = split.getSplitId(); - remainingSplits.removeIf(v -> { + boolean remove = remainingSplits.removeIf(v -> { if (v.getSplitId().equals(splitId)) { snapshotSplit.setTableId(v.getTableId()); snapshotSplit.setSplitKey(v.getSplitKey()); @@ -167,9 +167,13 @@ public void updateOffset(Offset offset) { } return false; }); - finishedSplits.add(snapshotSplit); - chunkHighWatermarkMap.computeIfAbsent(snapshotSplit.getTableId(), k -> new HashMap<>()) - .put(snapshotSplit.getSplitId(), snapshotSplit.getHighWatermark()); + if (remove) { + finishedSplits.add(snapshotSplit); + chunkHighWatermarkMap.computeIfAbsent(snapshotSplit.getTableId(), k -> new HashMap<>()) + .put(snapshotSplit.getSplitId(), snapshotSplit.getHighWatermark()); + } else { + log.warn("Cannot find snapshot split {} in remainingSplits for job {}", splitId, getJobId()); + } } else { BinlogSplit binlogSplit = (BinlogSplit) split; binlogOffsetPersist = new HashMap<>(binlogSplit.getStartingOffset()); @@ -192,7 +196,7 @@ public void fetchRemoteMeta(Map properties) throws Exception { result = future.get(); TStatusCode code = TStatusCode.findByValue(result.getStatus().getStatusCode()); if (code != TStatusCode.OK) { - log.error("Failed to get end offset from backend, {}", result.getStatus().getErrorMsgs(0)); + log.warn("Failed to get end offset from backend, {}", result.getStatus().getErrorMsgs(0)); throw new JobException( "Failed to get end offset from backend," + result.getStatus().getErrorMsgs(0) + ", response: " + result.getResponse()); @@ -210,11 +214,11 @@ public void fetchRemoteMeta(Map properties) throws Exception { } endBinlogOffset = responseObj.getData(); } catch (JsonProcessingException e) { - log.error("Failed to parse end offset response: {}", response, e); + log.warn("Failed to parse end offset response: {}", response); throw new JobException(response); } } catch (ExecutionException | InterruptedException ex) { - log.error("Get end offset error: ", ex); + log.warn("Get end offset error: ", ex); throw new 
JobException(ex); } } @@ -268,7 +272,7 @@ private boolean compareOffset(Map offsetFirst, Map offsetFirst, Map 0; } catch (JsonProcessingException e) { - log.error("Failed to parse compare offset response: {}", response, e); + log.warn("Failed to parse compare offset response: {}", response); throw new JobException("Failed to parse compare offset response: " + response); } } catch (ExecutionException | InterruptedException ex) { - log.error("Compare offset error: ", ex); + log.warn("Compare offset error: ", ex); throw new JobException(ex); } } @@ -454,7 +458,7 @@ private List requestTableSplits(String table) throws JobException result = future.get(); TStatusCode code = TStatusCode.findByValue(result.getStatus().getStatusCode()); if (code != TStatusCode.OK) { - log.error("Failed to get split from backend, {}", result.getStatus().getErrorMsgs(0)); + log.warn("Failed to get split from backend, {}", result.getStatus().getErrorMsgs(0)); throw new JobException( "Failed to get split from backend," + result.getStatus().getErrorMsgs(0) + ", response: " + result.getResponse()); @@ -469,11 +473,11 @@ private List requestTableSplits(String table) throws JobException List splits = responseObj.getData(); return splits; } catch (JsonProcessingException e) { - log.error("Failed to parse split response: {}", response, e); + log.warn("Failed to parse split response: {}", response); throw new JobException("Failed to parse split response: " + response); } } catch (ExecutionException | InterruptedException ex) { - log.error("Get splits error: ", ex); + log.warn("Get splits error: ", ex); throw new JobException(ex); } } @@ -486,9 +490,26 @@ private boolean checkNeedSplitChunks(Map sourceProperties) { return DataSourceConfigKeys.OFFSET_INITIAL.equalsIgnoreCase(startMode); } - public void cleanMeta(Long jobId) { + public void cleanMeta(Long jobId) throws JobException { // clean meta table StreamingJobUtils.deleteJobMeta(jobId); - // todo: close cdc client source + Backend backend = StreamingJobUtils.selectBackend(jobId); + JobBaseConfig requestParams = new JobBaseConfig(getJobId(), sourceType.name(), sourceProperties); + InternalService.PRequestCdcClientRequest request = InternalService.PRequestCdcClientRequest.newBuilder() + .setApi("/api/close") + .setParams(new Gson().toJson(requestParams)).build(); + TNetworkAddress address = new TNetworkAddress(backend.getHost(), backend.getBrpcPort()); + InternalService.PRequestCdcClientResult result = null; + try { + Future future = + BackendServiceProxy.getInstance().requestCdcClient(address, request); + result = future.get(); + TStatusCode code = TStatusCode.findByValue(result.getStatus().getStatusCode()); + if (code != TStatusCode.OK) { + log.warn("Failed to close job {} source {}", jobId, result.getStatus().getErrorMsgs(0)); + } + } catch (ExecutionException | InterruptedException ex) { + log.warn("Close job error: ", ex); + } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/job/util/StreamingJobUtils.java b/fe/fe-core/src/main/java/org/apache/doris/job/util/StreamingJobUtils.java index 436ef0aea35991..0281503448cd7d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/job/util/StreamingJobUtils.java +++ b/fe/fe-core/src/main/java/org/apache/doris/job/util/StreamingJobUtils.java @@ -22,12 +22,13 @@ import org.apache.doris.catalog.Database; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.KeysType; +import org.apache.doris.catalog.PrimitiveType; +import org.apache.doris.catalog.ScalarType; import org.apache.doris.catalog.Table; import 
org.apache.doris.common.FeConstants; import org.apache.doris.datasource.InternalCatalog; import org.apache.doris.datasource.jdbc.client.JdbcClient; import org.apache.doris.datasource.jdbc.client.JdbcClientConfig; -import org.apache.doris.datasource.jdbc.client.JdbcMySQLClient; import org.apache.doris.job.cdc.DataSourceConfigKeys; import org.apache.doris.job.cdc.split.SnapshotSplit; import org.apache.doris.job.common.DataSourceType; @@ -48,9 +49,11 @@ import org.apache.doris.thrift.TUniqueId; import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import lombok.extern.log4j.Log4j2; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.text.StringSubstitutor; import java.io.IOException; @@ -199,8 +202,7 @@ private static ConnectContext buildConnectContext() { return ctx; } - private static JdbcClient getJdbcClient(DataSourceType sourceType, Map properties) - throws JobException { + private static JdbcClient getJdbcClient(DataSourceType sourceType, Map properties) { JdbcClientConfig config = new JdbcClientConfig(); config.setCatalog(sourceType.name()); config.setUser(properties.get(DataSourceConfigKeys.USER)); @@ -208,13 +210,7 @@ private static JdbcClient getJdbcClient(DataSourceType sourceType, Map generateCreateTableCmds(String targetDb, if (includeTables != null) { includeTablesList = Arrays.asList(includeTables.split(",")); } + List excludeTablesList = new ArrayList<>(); + if (excludeTables != null) { + excludeTablesList = Arrays.asList(excludeTables.split(",")); + } - String database = properties.get(DataSourceConfigKeys.DATABASE); JdbcClient jdbcClient = getJdbcClient(sourceType, properties); + String database = getRemoteDbName(sourceType, properties); List tablesNameList = jdbcClient.getTablesNameList(database); if (tablesNameList.isEmpty()) { throw new JobException("No tables found in database " + database); } Map tableCreateProperties = getTableCreateProperties(targetProperties); + List noPrimaryKeyTables = new ArrayList<>(); for (String table : tablesNameList) { if (!includeTablesList.isEmpty() && !includeTablesList.contains(table)) { @@ -264,14 +265,16 @@ public static List generateCreateTableCmds(String targetDb, continue; } - if (excludeTables != null && excludeTables.contains(table)) { + // if set include_tables, exclude_tables is ignored + if (includeTablesList.isEmpty() + && !excludeTablesList.isEmpty() && excludeTablesList.contains(table)) { log.info("Skip table {} in database {} as it in exclude_tables {}", table, database, excludeTables); continue; } - List columns = jdbcClient.getColumnsFromJdbc(database, table); List primaryKeys = jdbcClient.getPrimaryKeys(database, table); + List columns = getColumns(jdbcClient, database, table, primaryKeys); if (primaryKeys.isEmpty()) { noPrimaryKeyTables.add(table); } @@ -324,6 +327,43 @@ public static List generateCreateTableCmds(String targetDb, return createtblCmds; } + private static List getColumns(JdbcClient jdbcClient, + String database, + String table, + List primaryKeys) { + List columns = jdbcClient.getColumnsFromJdbc(database, table); + columns.forEach(col -> { + // string can not to be key + if (primaryKeys.contains(col.getName()) + && col.getDataType() == PrimitiveType.STRING) { + col.setType(ScalarType.createVarcharType(ScalarType.MAX_VARCHAR_LENGTH)); + } + }); + return columns; + } + + /** + * The remoteDB implementation differs for each data source; + * refer to the 
hierarchical mapping in the JDBC catalog. + */ + private static String getRemoteDbName(DataSourceType sourceType, Map properties) + throws JobException { + String remoteDb = null; + switch (sourceType) { + case MYSQL: + remoteDb = properties.get(DataSourceConfigKeys.DATABASE); + Preconditions.checkArgument(StringUtils.isNotEmpty(remoteDb), "database is required"); + break; + case POSTGRES: + remoteDb = properties.get(DataSourceConfigKeys.SCHEMA); + Preconditions.checkArgument(StringUtils.isNotEmpty(remoteDb), "schema is required"); + break; + default: + throw new JobException("Unsupported source type " + sourceType); + } + return remoteDb; + } + private static Map getTableCreateProperties(Map properties) { final Map tableCreateProps = new HashMap<>(); for (Map.Entry entry : properties.entrySet()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilderForEncryption.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilderForEncryption.java index 740766263815a1..3c9eea1d386e89 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilderForEncryption.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilderForEncryption.java @@ -22,6 +22,7 @@ import org.apache.doris.common.util.PrintableMap; import org.apache.doris.nereids.DorisParser; import org.apache.doris.nereids.DorisParser.InsertTableContext; +import org.apache.doris.nereids.DorisParser.JobFromToClauseContext; import org.apache.doris.nereids.DorisParser.SupportedDmlStatementContext; import org.apache.doris.nereids.trees.plans.commands.info.SetVarOp; import org.apache.doris.nereids.trees.plans.logical.LogicalPlan; @@ -157,8 +158,16 @@ public LogicalPlan visitTableValuedFunction(DorisParser.TableValuedFunctionConte // create job select tvf @Override public LogicalPlan visitCreateScheduledJob(DorisParser.CreateScheduledJobContext ctx) { - SupportedDmlStatementContext supportedDmlStatementContext = ctx.supportedDmlStatement(); - visitInsertTable((InsertTableContext) supportedDmlStatementContext); + if (ctx.supportedDmlStatement() != null) { + SupportedDmlStatementContext supportedDmlStatementContext = ctx.supportedDmlStatement(); + visitInsertTable((InsertTableContext) supportedDmlStatementContext); + } else if (ctx.jobFromToClause() != null) { + JobFromToClauseContext jobFromToClauseContext = ctx.jobFromToClause(); + encryptProperty(visitPropertyItemList(jobFromToClauseContext.sourceProperties), + jobFromToClauseContext.sourceProperties.start.getStartIndex(), + jobFromToClauseContext.sourceProperties.stop.getStopIndex()); + + } return super.visitCreateScheduledJob(ctx); } @@ -168,6 +177,12 @@ public LogicalPlan visitAlterJob(DorisParser.AlterJobContext ctx) { SupportedDmlStatementContext supportedDmlStatementContext = ctx.supportedDmlStatement(); if (ctx.supportedDmlStatement() != null) { visitInsertTable((InsertTableContext) supportedDmlStatementContext); + } else if (ctx.jobFromToClause() != null) { + JobFromToClauseContext jobFromToClauseContext = ctx.jobFromToClause(); + encryptProperty(visitPropertyItemList(jobFromToClauseContext.sourceProperties), + jobFromToClauseContext.sourceProperties.start.getStartIndex(), + jobFromToClauseContext.sourceProperties.stop.getStopIndex()); + } return super.visitAlterJob(ctx); } diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/parser/EncryptSQLTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/parser/EncryptSQLTest.java index 84476cb4614558..c2a39e366e3738 
100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/parser/EncryptSQLTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/parser/EncryptSQLTest.java @@ -271,7 +271,7 @@ public boolean isForwardToMaster() { res = "SET PASSWORD FOR 'admin' = PASSWORD('*XXX')"; parseAndCheck(sql, res); - // create job + // create s3 job sql = "CREATE JOB my_job" + " ON STREAMING" + " DO" @@ -303,7 +303,7 @@ public boolean isForwardToMaster() { + " );"; parseAndCheck(sql, res); - // alter job + // alter s3 job sql = "ALTER JOB my_job" + " INSERT INTO test.`student`" + " SELECT * FROM S3" @@ -331,6 +331,68 @@ public boolean isForwardToMaster() { + " );"; parseAndCheck(sql, res); + // create mysql job + sql = "CREATE JOB my_mysql_job " + + "ON STREAMING " + + "FROM MYSQL ( " + + "\"jdbc_url\" = \"jdbc:mysql://127.0.0.1:3306\", " + + "\"driver_url\" = \"mysql-connector-j-8.4.0.jar\", " + + "\"driver_class\" = \"com.mysql.cj.jdbc.Driver\", " + + "\"user\" = \"root\", " + + "\"password\" = \"123456\", " + + "\"database\" = \"test_cdc_db\", " + + "\"include_tables\" = \"mysqltable\", " + + "\"offset\" = \"initial\" " + + ") " + + "TO DATABASE targetDB ( " + + "\"table.create.properties.replication_num\" = \"1\" " + + ")"; + + res = "CREATE JOB my_mysql_job " + + "ON STREAMING " + + "FROM MYSQL ( " + + "\"jdbc_url\" = \"jdbc:mysql://127.0.0.1:3306\", " + + "\"driver_url\" = \"mysql-connector-j-8.4.0.jar\", " + + "\"driver_class\" = \"com.mysql.cj.jdbc.Driver\", " + + "\"user\" = \"root\", " + + "\"password\" = \"*XXX\", " + + "\"database\" = \"test_cdc_db\", " + + "\"include_tables\" = \"mysqltable\", " + + "\"offset\" = \"initial\" " + + ") " + + "TO DATABASE targetDB ( " + + "\"table.create.properties.replication_num\" = \"1\" " + + ")"; + parseAndCheck(sql, res); + + // alter mysql job + sql = "ALTER JOB my_mysql_job " + + "FROM MYSQL ( " + + "\"jdbc_url\" = \"jdbc:mysql://127.0.0.1:3306\", " + + "\"driver_url\" = \"mysql-connector-j-8.4.0.jar\", " + + "\"driver_class\" = \"com.mysql.cj.jdbc.Driver\", " + + "\"user\" = \"mysql_job_priv\", " + + "\"password\" = \"test123\", " + + "\"database\" = \"test_cdc_db\", " + + "\"include_tables\" = \"mysqltable\", " + + "\"offset\" = \"latest\"" + + ")" + + "TO DATABASE targetDB"; + + res = "ALTER JOB my_mysql_job " + + "FROM MYSQL ( " + + "\"jdbc_url\" = \"jdbc:mysql://127.0.0.1:3306\", " + + "\"driver_url\" = \"mysql-connector-j-8.4.0.jar\", " + + "\"driver_class\" = \"com.mysql.cj.jdbc.Driver\", " + + "\"user\" = \"mysql_job_priv\", " + + "\"password\" = \"*XXX\", " + + "\"database\" = \"test_cdc_db\", " + + "\"include_tables\" = \"mysqltable\", " + + "\"offset\" = \"latest\"" + + ")" + + "TO DATABASE targetDB"; + parseAndCheck(sql, res); + sql = "selected * from tbl"; res = "Syntax Error"; parseAndCheck(sql, res); diff --git a/fs_brokers/cdc_client/build.sh b/fs_brokers/cdc_client/build.sh index 52d9683baf0ee3..6f2e325b3ecafb 100755 --- a/fs_brokers/cdc_client/build.sh +++ b/fs_brokers/cdc_client/build.sh @@ -25,10 +25,10 @@ export DORIS_HOME="${ROOT}/../.." export CDC_CLIENT_HOME="${ROOT}" +"${DORIS_HOME}"/generated-source.sh noclean cd "${DORIS_HOME}/fe" "${MVN_CMD}" install -pl fe-common -Dskip.doc=true -DskipTests - echo "Install cdc client..." cd "${CDC_CLIENT_HOME}" "${MVN_CMD}" package -DskipTests diff --git a/fs_brokers/cdc_client/pom.xml b/fs_brokers/cdc_client/pom.xml index 1f5723f9b9cacf..c2e3580be76013 100644 --- a/fs_brokers/cdc_client/pom.xml +++ b/fs_brokers/cdc_client/pom.xml @@ -120,6 +120,11 @@ under the License. 
flink-connector-mysql-cdc 3.5.0 + + org.apache.flink + flink-connector-postgres-cdc + 3.5.0 + org.apache.flink flink-clients diff --git a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/common/Constants.java b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/common/Constants.java index f7a49bdaa08c01..2afacf6b9e7cdd 100644 --- a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/common/Constants.java +++ b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/common/Constants.java @@ -19,4 +19,6 @@ public class Constants { public static final String DORIS_DELETE_SIGN = "__DORIS_DELETE_SIGN__"; + public static final long DEBEZIUM_HEARTBEAT_INTERVAL_MS = 10000L; + public static final long POLL_SPLIT_RECORDS_TIMEOUTS = 15000L; } diff --git a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/common/Env.java b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/common/Env.java index a37222158650b2..ff4056a8b5b3b1 100644 --- a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/common/Env.java +++ b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/common/Env.java @@ -26,16 +26,24 @@ import java.util.Map; import java.util.Objects; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; +import com.google.common.base.Preconditions; import lombok.Setter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class Env { + private static final Logger LOG = LoggerFactory.getLogger(Env.class); private static volatile Env INSTANCE; private final Map jobContexts; + private final Map jobLocks; @Setter private int backendHttpPort; private Env() { this.jobContexts = new ConcurrentHashMap<>(); + this.jobLocks = new ConcurrentHashMap<>(); } public String getBackendHostPort() { @@ -72,22 +80,46 @@ private DataSource resolveDataSource(String source) { private SourceReader getOrCreateReader( Long jobId, DataSource dataSource, Map config) { - JobContext context = getOrCreateContext(jobId, dataSource, config); - return context.getOrCreateReader(dataSource); - } - - public void close(Long jobId) { - JobContext context = jobContexts.remove(jobId); + Objects.requireNonNull(jobId, "jobId"); + Objects.requireNonNull(dataSource, "dataSource"); + JobContext context = jobContexts.get(jobId); if (context != null) { - context.close(); + return context.getReader(dataSource); + } + + Lock lock = jobLocks.computeIfAbsent(jobId, k -> new ReentrantLock()); + lock.lock(); + try { + // double check + context = jobContexts.get(jobId); + if (context != null) { + return context.getReader(dataSource); + } + + LOG.info("Creating new reader for job {}, dataSource {}", jobId, dataSource); + context = new JobContext(jobId, dataSource, config); + SourceReader reader = context.initializeReader(); + jobContexts.put(jobId, context); + return reader; + } finally { + lock.unlock(); } } - private JobContext getOrCreateContext( - Long jobId, DataSource dataSource, Map config) { - Objects.requireNonNull(jobId, "jobId"); - Objects.requireNonNull(dataSource, "dataSource"); - return jobContexts.computeIfAbsent(jobId, id -> new JobContext(id, dataSource, config)); + public void close(Long jobId) { + Lock lock = jobLocks.get(jobId); + if (lock != null) { + lock.lock(); + try { + jobContexts.remove(jobId); + jobLocks.remove(jobId); + } finally { + lock.unlock(); + } + } else { + // should not happen + jobContexts.remove(jobId); + } } private static final class 
JobContext { @@ -102,25 +134,22 @@ private JobContext(long jobId, DataSource dataSource, Map config this.config = config; } - private synchronized SourceReader getOrCreateReader(DataSource source) { - if (reader == null) { - reader = SourceReaderFactory.createSourceReader(source); - reader.initialize(config); - dataSource = source; - } else if (dataSource != source) { + private SourceReader initializeReader() { + SourceReader newReader = SourceReaderFactory.createSourceReader(dataSource); + newReader.initialize(jobId, dataSource, config); + this.reader = newReader; + return reader; + } + + private SourceReader getReader(DataSource source) { + if (this.dataSource != source) { throw new IllegalStateException( String.format( "Job %d already bound to datasource %s, cannot switch to %s", - jobId, dataSource, source)); + jobId, this.dataSource, source)); } + Preconditions.checkState(reader != null, "Job %d reader not initialized yet", jobId); return reader; } - - private void close() { - if (reader != null) { - reader.close(jobId); - reader = null; - } - } } } diff --git a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/controller/ClientController.java b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/controller/ClientController.java index 916b461e72dedf..2f444260559003 100644 --- a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/controller/ClientController.java +++ b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/controller/ClientController.java @@ -32,7 +32,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.web.bind.annotation.PathVariable; import org.springframework.web.bind.annotation.RequestBody; import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RequestMethod; @@ -61,8 +60,7 @@ public Object fetchSplits(@RequestBody FetchTableSplitsRequest ftsReq) { @RequestMapping(path = "/api/fetchRecords", method = RequestMethod.POST) public Object fetchRecords(@RequestBody FetchRecordRequest recordReq) { try { - SourceReader reader = Env.getCurrentEnv().getReader(recordReq); - return RestResponse.success(reader.read(recordReq)); + return RestResponse.success(pipelineCoordinator.fetchRecords(recordReq)); } catch (Exception ex) { LOG.error("Failed fetch record, jobId={}", recordReq.getJobId(), ex); return RestResponse.internalError(ex.getMessage()); @@ -84,6 +82,7 @@ public Object writeRecord(@RequestBody WriteRecordRequest recordReq) { /** Fetch lastest end meta */ @RequestMapping(path = "/api/fetchEndOffset", method = RequestMethod.POST) public Object fetchEndOffset(@RequestBody JobBaseConfig jobConfig) { + LOG.info("Fetching end offset for job {}", jobConfig.getJobId()); SourceReader reader = Env.getCurrentEnv().getReader(jobConfig); return RestResponse.success(reader.getEndOffset(jobConfig)); } @@ -96,11 +95,14 @@ public Object compareOffset(@RequestBody CompareOffsetRequest compareOffsetReque } /** Close job */ - @RequestMapping(path = "/api/close/{jobId}", method = RequestMethod.POST) - public Object close(@PathVariable long jobId) { + @RequestMapping(path = "/api/close", method = RequestMethod.POST) + public Object close(@RequestBody JobBaseConfig jobConfig) { + LOG.info("Closing job {}", jobConfig.getJobId()); Env env = Env.getCurrentEnv(); - env.close(jobId); - pipelineCoordinator.closeJob(jobId); + SourceReader reader = env.getReader(jobConfig); + reader.close(jobConfig); + 
env.close(jobConfig.getJobId()); + pipelineCoordinator.closeJobStreamLoad(jobConfig.getJobId()); return RestResponse.success(true); } } diff --git a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/model/rest/RestResponse.java b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/model/rest/RestResponse.java index 5126f138c2db48..3702a852dd7f25 100644 --- a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/model/rest/RestResponse.java +++ b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/model/rest/RestResponse.java @@ -19,11 +19,13 @@ import java.io.Serializable; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import lombok.Data; import lombok.NoArgsConstructor; @Data @NoArgsConstructor +@JsonIgnoreProperties(ignoreUnknown = true) public class RestResponse implements Serializable { private static final long serialVersionUID = 1L; public static final int SUCCESS = 0; diff --git a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/service/PipelineCoordinator.java b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/service/PipelineCoordinator.java index 40d66e7fa1e5e9..591c4790e6ca42 100644 --- a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/service/PipelineCoordinator.java +++ b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/service/PipelineCoordinator.java @@ -18,14 +18,17 @@ package org.apache.doris.cdcclient.service; import org.apache.doris.cdcclient.common.Env; -import org.apache.doris.cdcclient.exception.StreamLoadException; +import org.apache.doris.cdcclient.model.response.RecordWithMeta; import org.apache.doris.cdcclient.sink.DorisBatchStreamLoad; import org.apache.doris.cdcclient.source.reader.SourceReader; import org.apache.doris.cdcclient.source.reader.SplitReadResult; +import org.apache.doris.job.cdc.request.FetchRecordRequest; import org.apache.doris.job.cdc.request.WriteRecordRequest; +import org.apache.doris.job.cdc.split.BinlogSplit; +import org.apache.doris.job.cdc.split.SnapshotSplit; import org.apache.commons.collections.CollectionUtils; -import org.apache.flink.cdc.connectors.mysql.source.utils.RecordUtils; +import org.apache.flink.api.connector.source.SourceSplit; import org.apache.kafka.connect.data.Struct; import org.apache.kafka.connect.source.SourceRecord; @@ -39,6 +42,7 @@ import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; +import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.base.Preconditions; import io.debezium.data.Envelope; import org.slf4j.Logger; @@ -55,6 +59,7 @@ public class PipelineCoordinator { private final ThreadPoolExecutor executor; private static final int MAX_CONCURRENT_TASKS = 10; private static final int QUEUE_CAPACITY = 128; + private static ObjectMapper objectMapper = new ObjectMapper(); public PipelineCoordinator() { this.executor = @@ -74,6 +79,82 @@ public PipelineCoordinator() { new ThreadPoolExecutor.AbortPolicy()); } + public RecordWithMeta fetchRecords(FetchRecordRequest fetchRecordRequest) throws Exception { + SourceReader sourceReader = Env.getCurrentEnv().getReader(fetchRecordRequest); + SplitReadResult readResult = sourceReader.readSplitRecords(fetchRecordRequest); + return buildRecordResponse(sourceReader, fetchRecordRequest, readResult); + } + + /** build RecordWithMeta */ + private RecordWithMeta buildRecordResponse( + SourceReader sourceReader, FetchRecordRequest fetchRecord, SplitReadResult readResult) + throws Exception { + RecordWithMeta 
recordResponse = new RecordWithMeta(); + SourceSplit split = readResult.getSplit(); + int count = 0; + try { + // Serialize records and add them to the response (collect from iterator) + Iterator iterator = readResult.getRecordIterator(); + while (iterator != null && iterator.hasNext()) { + SourceRecord element = iterator.next(); + List serializedRecords = + sourceReader.deserialize(fetchRecord.getConfig(), element); + if (!CollectionUtils.isEmpty(serializedRecords)) { + recordResponse.getRecords().addAll(serializedRecords); + count += serializedRecords.size(); + if (sourceReader.isBinlogSplit(split)) { + // put offset for event + Map lastMeta = + sourceReader.extractBinlogStateOffset(readResult.getSplitState()); + lastMeta.put(SPLIT_ID, BinlogSplit.BINLOG_SPLIT_ID); + recordResponse.setMeta(lastMeta); + } + if (count >= fetchRecord.getFetchSize()) { + return recordResponse; + } + } + } + } finally { + sourceReader.finishSplitRecords(); + } + + if (readResult.getSplitState() != null) { + // Set meta information for hw + if (sourceReader.isSnapshotSplit(split)) { + Map offsetRes = + sourceReader.extractSnapshotStateOffset(readResult.getSplitState()); + offsetRes.put(SPLIT_ID, split.splitId()); + recordResponse.setMeta(offsetRes); + return recordResponse; + } + // set meta for binlog event + if (sourceReader.isBinlogSplit(split)) { + Map offsetRes = + sourceReader.extractBinlogStateOffset(readResult.getSplitState()); + offsetRes.put(SPLIT_ID, BinlogSplit.BINLOG_SPLIT_ID); + } + } + + // no data in this split, set meta info + if (CollectionUtils.isEmpty(recordResponse.getRecords())) { + if (sourceReader.isBinlogSplit(split)) { + Map offsetRes = + sourceReader.extractBinlogOffset(readResult.getSplit()); + offsetRes.put(SPLIT_ID, BinlogSplit.BINLOG_SPLIT_ID); + recordResponse.setMeta(offsetRes); + } else { + SnapshotSplit snapshotSplit = + objectMapper.convertValue(fetchRecord.getMeta(), SnapshotSplit.class); + Map meta = new HashMap<>(); + meta.put(SPLIT_ID, snapshotSplit.getSplitId()); + // chunk no data + recordResponse.setMeta(meta); + } + } + sourceReader.commitSourceOffset(fetchRecord.getJobId(), readResult.getSplit()); + return recordResponse; + } + public CompletableFuture writeRecordsAsync(WriteRecordRequest writeRecordRequest) { Preconditions.checkNotNull(writeRecordRequest.getToken(), "token must not be null"); Preconditions.checkNotNull(writeRecordRequest.getTaskId(), "taskId must not be null"); @@ -91,7 +172,7 @@ public CompletableFuture writeRecordsAsync(WriteRecordRequest writeRecordR writeRecordRequest.getJobId(), writeRecordRequest.getTaskId()); } catch (Exception ex) { - closeJob(writeRecordRequest.getJobId()); + closeJobStreamLoad(writeRecordRequest.getJobId()); LOG.error( "Failed to process async write record, jobId={} taskId={}", writeRecordRequest.getJobId(), @@ -142,13 +223,13 @@ public void writeRecords(WriteRecordRequest writeRecordRequest) throws Exception batchStreamLoad.writeRecord(database, table, dataBytes); } - Map lastMeta = - RecordUtils.getBinlogPosition(element).getOffset(); - if (sourceReader.isBinlogSplit(readResult.getSplit()) - && readResult.getSplit() != null) { - lastMeta.put(SPLIT_ID, readResult.getSplit().splitId()); + if (sourceReader.isBinlogSplit(readResult.getSplit())) { + // put offset for event + Map lastMeta = + sourceReader.extractBinlogStateOffset(readResult.getSplitState()); + lastMeta.put(SPLIT_ID, BinlogSplit.BINLOG_SPLIT_ID); + metaResponse = lastMeta; } - metaResponse = lastMeta; } // Check if maxInterval has been exceeded long 
elapsedTime = System.currentTimeMillis() - startTime; @@ -170,6 +251,7 @@ public void writeRecords(WriteRecordRequest writeRecordRequest) throws Exception if (sourceReader.isBinlogSplit(readResult.getSplit())) { Map offsetRes = sourceReader.extractBinlogOffset(readResult.getSplit()); + offsetRes.put(SPLIT_ID, BinlogSplit.BINLOG_SPLIT_ID); batchStreamLoad.commitOffset(offsetRes, scannedRows, scannedBytes); return; } else { @@ -180,20 +262,17 @@ public void writeRecords(WriteRecordRequest writeRecordRequest) throws Exception // wait all stream load finish batchStreamLoad.forceFlush(); // update offset meta - if (!sourceReader.isBinlogSplit(readResult.getSplit())) { + if (sourceReader.isSnapshotSplit(readResult.getSplit())) { Map offsetRes = - sourceReader.extractSnapshotOffset( - readResult.getSplit(), readResult.getSplitState()); - if (offsetRes == null) { - // should not happen - throw new StreamLoadException( - "Chunk data cannot be obtained from highWatermark."); - } + sourceReader.extractSnapshotStateOffset(readResult.getSplitState()); + offsetRes.put(SPLIT_ID, readResult.getSplit().splitId()); metaResponse = offsetRes; } // request fe api batchStreamLoad.commitOffset(metaResponse, scannedRows, scannedBytes); + // commit source offset if need + sourceReader.commitSourceOffset(writeRecordRequest.getJobId(), readResult.getSplit()); } finally { batchStreamLoad.resetTaskId(); } @@ -208,7 +287,7 @@ private DorisBatchStreamLoad getOrCreateBatchStreamLoad(Long jobId, String targe }); } - public void closeJob(Long jobId) { + public void closeJobStreamLoad(Long jobId) { DorisBatchStreamLoad batchStreamLoad = batchStreamLoadMap.remove(jobId); if (batchStreamLoad != null) { LOG.info("Close DorisBatchStreamLoad for jobId={}", jobId); diff --git a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/sink/DorisBatchStreamLoad.java b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/sink/DorisBatchStreamLoad.java index ed100dde3830bf..bf6a4102801059 100644 --- a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/sink/DorisBatchStreamLoad.java +++ b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/sink/DorisBatchStreamLoad.java @@ -504,6 +504,9 @@ public void commitOffset(Map meta, long scannedRows, long scanne LOG.info("commit result {}", responseBody); if (statusCode == 200) { LOG.info("commit offset for jobId {} taskId {}", jobId, currentTaskId); + // A 200 response indicates that the request was successful, and + // information such as offset and statistics may have already been + // updated. Retrying may result in repeated updates. 
return; } LOG.error( diff --git a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/deserialize/DebeziumJsonDeserializer.java b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/deserialize/DebeziumJsonDeserializer.java index 1bc63dc3f66b50..556c186b5d4a55 100644 --- a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/deserialize/DebeziumJsonDeserializer.java +++ b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/deserialize/DebeziumJsonDeserializer.java @@ -43,15 +43,21 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Optional; import static org.apache.doris.cdcclient.common.Constants.DORIS_DELETE_SIGN; +import com.esri.core.geometry.ogc.OGCGeometry; import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import io.debezium.data.Bits; import io.debezium.data.Envelope; import io.debezium.data.SpecialValueDecimal; import io.debezium.data.VariableScaleDecimal; +import io.debezium.data.geometry.Geography; +import io.debezium.data.geometry.Geometry; +import io.debezium.data.geometry.Point; import io.debezium.time.MicroTime; import io.debezium.time.MicroTimestamp; import io.debezium.time.NanoTime; @@ -76,7 +82,7 @@ public DebeziumJsonDeserializer() {} @Override public void init(Map props) { this.serverTimeZone = - ConfigUtil.getServerTimeZone(props.get(DataSourceConfigKeys.JDBC_URL)); + ConfigUtil.getServerTimeZoneFromJdbcUrl(props.get(DataSourceConfigKeys.JDBC_URL)); } @Override @@ -201,6 +207,10 @@ private Object convert(Schema fieldSchema, Object dbzObj) { return convertDecimal(dbzObj, fieldSchema); case Bits.LOGICAL_NAME: return dbzObj; + case Point.LOGICAL_NAME: + case Geometry.LOGICAL_NAME: + case Geography.LOGICAL_NAME: + return convertPoint(dbzObj); default: LOG.debug( "Unsupported type: {} with name {}, transform value to string", @@ -211,6 +221,37 @@ private Object convert(Schema fieldSchema, Object dbzObj) { } } + private Object convertPoint(Object dbzObj) { + // the Geometry datatype in PostgreSQL will be converted to + // a String with Json format + try { + Struct geometryStruct = (Struct) dbzObj; + byte[] wkb = geometryStruct.getBytes("wkb"); + + String geoJson = OGCGeometry.fromBinary(ByteBuffer.wrap(wkb)).asGeoJson(); + JsonNode originGeoNode = objectMapper.readTree(geoJson); + + Optional srid = Optional.ofNullable(geometryStruct.getInt32("srid")); + Map geometryInfo = new HashMap<>(); + String geometryType = originGeoNode.get("type").asText(); + + geometryInfo.put("type", geometryType); + if ("GeometryCollection".equals(geometryType)) { + geometryInfo.put("geometries", originGeoNode.get("geometries")); + } else { + geometryInfo.put("coordinates", originGeoNode.get("coordinates")); + } + + geometryInfo.put("srid", srid.orElse(0)); + return objectMapper.writeValueAsString(geometryInfo); + } catch (Exception e) { + LOG.debug( + "Failed to parse Geometry datatype, converting the value to string {}", + dbzObj.toString()); + return dbzObj.toString(); + } + } + private Object convertZoneTimestamp(Object dbzObj) { if (dbzObj instanceof String) { String str = (String) dbzObj; diff --git a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/factory/DataSource.java b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/factory/DataSource.java index 20f1f94ecf9524..904cc32a53702e 100644 --- 
a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/factory/DataSource.java +++ b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/factory/DataSource.java @@ -18,5 +18,6 @@ package org.apache.doris.cdcclient.source.factory; public enum DataSource { - MYSQL + MYSQL, + POSTGRES } diff --git a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/factory/SourceReaderFactory.java b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/factory/SourceReaderFactory.java index 5a7a91c6f96a56..216c2514f42e17 100644 --- a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/factory/SourceReaderFactory.java +++ b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/factory/SourceReaderFactory.java @@ -19,6 +19,7 @@ import org.apache.doris.cdcclient.source.reader.SourceReader; import org.apache.doris.cdcclient.source.reader.mysql.MySqlSourceReader; +import org.apache.doris.cdcclient.source.reader.postgres.PostgresSourceReader; import java.util.Map; import java.util.Objects; @@ -36,11 +37,12 @@ public final class SourceReaderFactory { static { register(DataSource.MYSQL, MySqlSourceReader::new); + register(DataSource.POSTGRES, PostgresSourceReader::new); } private SourceReaderFactory() {} - public static void register(DataSource source, Supplier supplier) { + private static void register(DataSource source, Supplier supplier) { Objects.requireNonNull(source, "source"); Objects.requireNonNull(supplier, "supplier"); REGISTRY.put(source, supplier); diff --git a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/reader/JdbcIncrementalSourceReader.java b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/reader/JdbcIncrementalSourceReader.java new file mode 100644 index 00000000000000..f9e11f6b029aa3 --- /dev/null +++ b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/reader/JdbcIncrementalSourceReader.java @@ -0,0 +1,730 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.cdcclient.source.reader; + +import org.apache.doris.cdcclient.common.Constants; +import org.apache.doris.cdcclient.source.deserialize.DebeziumJsonDeserializer; +import org.apache.doris.cdcclient.source.deserialize.SourceRecordDeserializer; +import org.apache.doris.cdcclient.source.factory.DataSource; +import org.apache.doris.job.cdc.DataSourceConfigKeys; +import org.apache.doris.job.cdc.request.FetchTableSplitsRequest; +import org.apache.doris.job.cdc.request.JobBaseConfig; +import org.apache.doris.job.cdc.request.JobBaseRecordRequest; +import org.apache.doris.job.cdc.split.AbstractSourceSplit; +import org.apache.doris.job.cdc.split.BinlogSplit; +import org.apache.doris.job.cdc.split.SnapshotSplit; + +import org.apache.commons.collections.CollectionUtils; +import org.apache.flink.api.connector.source.SourceSplit; +import org.apache.flink.api.connector.source.mocks.MockSplitEnumeratorContext; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.cdc.common.utils.Preconditions; +import org.apache.flink.cdc.connectors.base.config.JdbcSourceConfig; +import org.apache.flink.cdc.connectors.base.dialect.JdbcDataSourceDialect; +import org.apache.flink.cdc.connectors.base.options.StartupOptions; +import org.apache.flink.cdc.connectors.base.source.assigner.HybridSplitAssigner; +import org.apache.flink.cdc.connectors.base.source.assigner.SnapshotSplitAssigner; +import org.apache.flink.cdc.connectors.base.source.meta.offset.Offset; +import org.apache.flink.cdc.connectors.base.source.meta.offset.OffsetFactory; +import org.apache.flink.cdc.connectors.base.source.meta.split.FinishedSnapshotSplitInfo; +import org.apache.flink.cdc.connectors.base.source.meta.split.SnapshotSplitState; +import org.apache.flink.cdc.connectors.base.source.meta.split.SourceRecords; +import org.apache.flink.cdc.connectors.base.source.meta.split.SourceSplitBase; +import org.apache.flink.cdc.connectors.base.source.meta.split.SourceSplitState; +import org.apache.flink.cdc.connectors.base.source.meta.split.StreamSplit; +import org.apache.flink.cdc.connectors.base.source.meta.split.StreamSplitState; +import org.apache.flink.cdc.connectors.base.source.meta.wartermark.WatermarkEvent; +import org.apache.flink.cdc.connectors.base.source.reader.external.FetchTask; +import org.apache.flink.cdc.connectors.base.source.reader.external.Fetcher; +import org.apache.flink.cdc.connectors.base.source.reader.external.IncrementalSourceScanFetcher; +import org.apache.flink.cdc.connectors.base.source.reader.external.IncrementalSourceStreamFetcher; +import org.apache.flink.cdc.connectors.base.utils.SourceRecordUtils; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.kafka.connect.source.SourceRecord; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Optional; + +import static org.apache.flink.cdc.connectors.base.source.meta.split.StreamSplit.STREAM_SPLIT_ID; + +import com.fasterxml.jackson.databind.ObjectMapper; +import io.debezium.relational.Column; +import io.debezium.relational.TableId; +import io.debezium.relational.history.TableChanges; +import lombok.Data; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@Data +public abstract class JdbcIncrementalSourceReader implements 
SourceReader { + private static final Logger LOG = LoggerFactory.getLogger(JdbcIncrementalSourceReader.class); + private static ObjectMapper objectMapper = new ObjectMapper(); + private SourceRecordDeserializer> serializer; + private IncrementalSourceScanFetcher snapshotReader; + private IncrementalSourceStreamFetcher binlogReader; + private Fetcher currentReader; + private Map tableSchemas; + private SplitRecords currentSplitRecords; + private SourceSplitBase currentSplit; + protected FetchTask currentFetchTask; + + public JdbcIncrementalSourceReader() { + this.serializer = new DebeziumJsonDeserializer(); + } + + @Override + public void initialize(long jobId, DataSource dataSource, Map config) { + this.serializer.init(config); + } + + @Override + public List getSourceSplits(FetchTableSplitsRequest ftsReq) { + LOG.info("Get table {} splits for job {}", ftsReq.getSnapshotTable(), ftsReq.getJobId()); + JdbcSourceConfig sourceConfig = getSourceConfig(ftsReq); + List + remainingSnapshotSplits = new ArrayList<>(); + StreamSplit remainingStreamSplit = null; + + // Check startup mode - for PostgreSQL, we use similar logic as MySQL + String startupMode = ftsReq.getConfig().get(DataSourceConfigKeys.OFFSET); + if (DataSourceConfigKeys.OFFSET_INITIAL.equalsIgnoreCase(startupMode)) { + remainingSnapshotSplits = + startSplitChunks(sourceConfig, ftsReq.getSnapshotTable(), ftsReq.getConfig()); + } else { + // For non-initial mode, create a stream split + Offset startingOffset = createInitialOffset(); + remainingStreamSplit = + new StreamSplit( + STREAM_SPLIT_ID, + startingOffset, + createNoStoppingOffset(), + new ArrayList<>(), + new HashMap<>(), + 0); + } + + List splits = new ArrayList<>(); + if (!remainingSnapshotSplits.isEmpty()) { + for (org.apache.flink.cdc.connectors.base.source.meta.split.SnapshotSplit + snapshotSplit : remainingSnapshotSplits) { + String splitId = snapshotSplit.splitId(); + String tableId = snapshotSplit.getTableId().identifier(); + Object[] splitStart = snapshotSplit.getSplitStart(); + Object[] splitEnd = snapshotSplit.getSplitEnd(); + List splitKey = snapshotSplit.getSplitKeyType().getFieldNames(); + SnapshotSplit split = + new SnapshotSplit(splitId, tableId, splitKey, splitStart, splitEnd, null); + splits.add(split); + } + } else { + Offset startingOffset = remainingStreamSplit.getStartingOffset(); + BinlogSplit streamSplit = new BinlogSplit(); + streamSplit.setSplitId(remainingStreamSplit.splitId()); + streamSplit.setStartingOffset(startingOffset.getOffset()); + splits.add(streamSplit); + } + return splits; + } + + @Override + public SplitReadResult readSplitRecords(JobBaseRecordRequest baseReq) throws Exception { + Map offsetMeta = baseReq.getMeta(); + if (offsetMeta == null || offsetMeta.isEmpty()) { + throw new RuntimeException("miss meta offset"); + } + LOG.info("Job {} read split records with offset: {}", baseReq.getJobId(), offsetMeta); + + // If there is an active split being consumed, reuse it directly; + // Otherwise, create a new snapshot/stream split based on offset and start the reader. 
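+        // The incoming meta is either a serialized SnapshotSplit (splitId, tableId, splitKey, splitStart, splitEnd) or a stream offset keyed by splitId = BinlogSplit.BINLOG_SPLIT_ID plus source-specific offset fields (for Postgres, typically the LSN); createSourceSplit() rebuilds the corresponding split from it.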
+ SourceSplitBase split = null; + SplitRecords currentSplitRecords = this.getCurrentSplitRecords(); + if (currentSplitRecords == null) { + Fetcher currentReader = this.getCurrentReader(); + if (currentReader == null || baseReq.isReload()) { + LOG.info( + "No current reader or reload {}, create new split reader", + baseReq.isReload()); + // build split + Tuple2 splitFlag = createSourceSplit(offsetMeta, baseReq); + split = splitFlag.f0; + closeBinlogReader(); + currentSplitRecords = pollSplitRecordsWithSplit(split, baseReq); + this.setCurrentSplitRecords(currentSplitRecords); + this.setCurrentSplit(split); + } else if (currentReader instanceof IncrementalSourceStreamFetcher) { + LOG.info("Continue poll records with current binlog reader"); + // only for binlog reader + currentSplitRecords = pollSplitRecordsWithCurrentReader(currentReader); + split = this.getCurrentSplit(); + } else { + throw new RuntimeException("Should not happen"); + } + } else { + LOG.info( + "Continue read records with current split records, splitId: {}", + currentSplitRecords.getSplitId()); + } + + // build response with iterator + SplitReadResult result = new SplitReadResult(); + SourceSplitState currentSplitState = null; + SourceSplitBase currentSplit = this.getCurrentSplit(); + if (currentSplit.isSnapshotSplit()) { + currentSplitState = new SnapshotSplitState(currentSplit.asSnapshotSplit()); + } else { + currentSplitState = new StreamSplitState(currentSplit.asStreamSplit()); + } + + Iterator filteredIterator = + new FilteredRecordIterator(currentSplitRecords, currentSplitState); + + result.setRecordIterator(filteredIterator); + result.setSplitState(currentSplitState); + result.setSplit(split); + return result; + } + + protected abstract DataType fromDbzColumn(Column splitColumn); + + protected abstract Fetcher getSnapshotSplitReader( + JobBaseConfig jobConfig); + + protected abstract Fetcher getBinlogSplitReader( + JobBaseConfig jobConfig); + + protected abstract OffsetFactory getOffsetFactory(); + + protected abstract Offset createOffset(Map offset); + + protected abstract Offset createInitialOffset(); + + protected abstract Offset createNoStoppingOffset(); + + protected abstract JdbcDataSourceDialect getDialect(JdbcSourceConfig sourceConfig); + + protected Tuple2 createSourceSplit( + Map offsetMeta, JobBaseConfig jobConfig) { + Tuple2 splitRes = null; + String splitId = String.valueOf(offsetMeta.get(SPLIT_ID)); + if (!BinlogSplit.BINLOG_SPLIT_ID.equals(splitId)) { + org.apache.flink.cdc.connectors.base.source.meta.split.SnapshotSplit split = + createSnapshotSplit(offsetMeta, jobConfig); + splitRes = Tuple2.of(split, false); + } else { + splitRes = createStreamSplit(offsetMeta, jobConfig); + } + return splitRes; + } + + private org.apache.flink.cdc.connectors.base.source.meta.split.SnapshotSplit + createSnapshotSplit(Map offset, JobBaseConfig jobConfig) { + SnapshotSplit snapshotSplit = objectMapper.convertValue(offset, SnapshotSplit.class); + TableId tableId = TableId.parse(snapshotSplit.getTableId(), false); + Object[] splitStart = snapshotSplit.getSplitStart(); + Object[] splitEnd = snapshotSplit.getSplitEnd(); + List splitKeys = snapshotSplit.getSplitKey(); + Map tableSchemas = getTableSchemas(jobConfig); + TableChanges.TableChange tableChange = tableSchemas.get(tableId); + Preconditions.checkNotNull( + tableChange, "Can not find table " + tableId + " in job " + jobConfig.getJobId()); + // only support one split key + String splitKey = splitKeys.get(0); + io.debezium.relational.Column splitColumn = 
tableChange.getTable().columnWithName(splitKey); + RowType splitType = getSplitType(splitColumn); + org.apache.flink.cdc.connectors.base.source.meta.split.SnapshotSplit split = + new org.apache.flink.cdc.connectors.base.source.meta.split.SnapshotSplit( + tableId, + snapshotSplit.getSplitId(), + splitType, + splitStart, + splitEnd, + null, + tableSchemas); + return split; + } + + private RowType getSplitType(Column splitColumn) { + return (RowType) + DataTypes.ROW( + new DataTypes.Field[] { + DataTypes.FIELD( + splitColumn.name(), this.fromDbzColumn(splitColumn)) + }) + .getLogicalType(); + } + + private Tuple2 createStreamSplit( + Map meta, JobBaseConfig config) { + BinlogSplit streamSplit = objectMapper.convertValue(meta, BinlogSplit.class); + List finishedSnapshotSplitInfos = new ArrayList<>(); + Offset minOffsetFinishSplits = null; + Offset maxOffsetFinishSplits = null; + if (CollectionUtils.isNotEmpty(streamSplit.getFinishedSplits())) { + List splitWithHW = streamSplit.getFinishedSplits(); + List assignedSplitLists = + splitWithHW.stream() + .sorted(Comparator.comparing(AbstractSourceSplit::getSplitId)) + .toList(); + + for (SnapshotSplit split : assignedSplitLists) { + // find the min offset + Map offsetMap = split.getHighWatermark(); + Offset sourceOffset = createOffset(offsetMap); + if (minOffsetFinishSplits == null || sourceOffset.isBefore(minOffsetFinishSplits)) { + minOffsetFinishSplits = sourceOffset; + } + if (maxOffsetFinishSplits == null || sourceOffset.isAfter(maxOffsetFinishSplits)) { + maxOffsetFinishSplits = sourceOffset; + } + finishedSnapshotSplitInfos.add( + new FinishedSnapshotSplitInfo( + TableId.parse(split.getTableId()), + split.getSplitId(), + split.getSplitStart(), + split.getSplitEnd(), + sourceOffset, + getOffsetFactory())); + } + } + + Offset startOffset; + Offset lastOffset = + createOffset( + streamSplit.getStartingOffset() == null + ? new HashMap<>() + : streamSplit.getStartingOffset()); + if (minOffsetFinishSplits != null && lastOffset.getOffset().isEmpty()) { + startOffset = minOffsetFinishSplits; + } else if (!lastOffset.getOffset().isEmpty()) { + lastOffset.getOffset().remove(SPLIT_ID); + startOffset = lastOffset; + } else { + // The input offset from params is empty + JdbcSourceConfig sourceConfig = getSourceConfig(config); + startOffset = getStartOffsetFromConfig(sourceConfig); + } + + boolean pureStreamPhase = false; + if (maxOffsetFinishSplits == null) { + pureStreamPhase = true; + } else if (startOffset.isAtOrAfter(maxOffsetFinishSplits)) { + // All the offsets of the current split are smaller than the offset of the stream, + // indicating that the stream phase has been fully entered. 
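+            // (startOffset.isAtOrAfter(maxOffsetFinishSplits): the resolved start offset has already passed every finished snapshot split's high watermark)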
+ pureStreamPhase = true; + LOG.info( + "The stream phase has been fully entered, the current split is: {}", + startOffset); + } + + StreamSplit split = + new StreamSplit( + STREAM_SPLIT_ID, + startOffset, + createNoStoppingOffset(), + finishedSnapshotSplitInfos, + new HashMap<>(), + 0); + // filterTableSchema + StreamSplit streamSplitFinal = + StreamSplit.fillTableSchemas(split.asStreamSplit(), getTableSchemas(config)); + return Tuple2.of(streamSplitFinal, pureStreamPhase); + } + + private Offset getStartOffsetFromConfig(JdbcSourceConfig sourceConfig) { + StartupOptions startupOptions = sourceConfig.getStartupOptions(); + Offset startingOffset; + switch (startupOptions.startupMode) { + case LATEST_OFFSET: + startingOffset = getDialect(sourceConfig).displayCurrentOffset(sourceConfig); + break; + case EARLIEST_OFFSET: + startingOffset = createInitialOffset(); + break; + case TIMESTAMP: + case SPECIFIC_OFFSETS: + case COMMITTED_OFFSETS: + default: + throw new IllegalStateException( + "Unsupported startup mode " + startupOptions.startupMode); + } + return startingOffset; + } + + private List + startSplitChunks( + JdbcSourceConfig sourceConfig, + String snapshotTable, + Map config) { + List remainingTables = new ArrayList<>(); + if (snapshotTable != null) { + String schema = config.get(DataSourceConfigKeys.SCHEMA); + remainingTables.add(new TableId(null, schema, snapshotTable)); + } + List remainingSplits = + new ArrayList<>(); + HybridSplitAssigner splitAssigner = + new HybridSplitAssigner<>( + sourceConfig, + 1, + remainingTables, + true, + getDialect(sourceConfig), + getOffsetFactory(), + new MockSplitEnumeratorContext(1)); + splitAssigner.open(); + try { + while (true) { + Optional split = splitAssigner.getNext(); + if (split.isPresent()) { + org.apache.flink.cdc.connectors.base.source.meta.split.SnapshotSplit + snapshotSplit = split.get().asSnapshotSplit(); + remainingSplits.add(snapshotSplit); + } else { + break; + } + } + } finally { + closeChunkSplitterOnly(splitAssigner); + } + return remainingSplits; + } + + /** + * Close only the chunk splitter to avoid closing shared connection pools Similar to MySQL + * implementation Note: HybridSplitAssigner wraps SnapshotSplitAssigner, so we need to get the + * inner assigner first + */ + private static void closeChunkSplitterOnly(HybridSplitAssigner splitAssigner) { + try { + // First, get the inner SnapshotSplitAssigner from HybridSplitAssigner + java.lang.reflect.Field snapshotAssignerField = + HybridSplitAssigner.class.getDeclaredField("snapshotSplitAssigner"); + snapshotAssignerField.setAccessible(true); + SnapshotSplitAssigner snapshotSplitAssigner = + (SnapshotSplitAssigner) snapshotAssignerField.get(splitAssigner); + + if (snapshotSplitAssigner == null) { + LOG.warn("snapshotSplitAssigner is null in HybridSplitAssigner"); + return; + } + + // Call closeExecutorService() via reflection + java.lang.reflect.Method closeExecutorMethod = + SnapshotSplitAssigner.class.getDeclaredMethod("closeExecutorService"); + closeExecutorMethod.setAccessible(true); + closeExecutorMethod.invoke(snapshotSplitAssigner); + + // Call chunkSplitter.close() via reflection + java.lang.reflect.Field chunkSplitterField = + SnapshotSplitAssigner.class.getDeclaredField("chunkSplitter"); + chunkSplitterField.setAccessible(true); + Object chunkSplitter = chunkSplitterField.get(snapshotSplitAssigner); + + if (chunkSplitter != null) { + java.lang.reflect.Method closeMethod = chunkSplitter.getClass().getMethod("close"); + closeMethod.invoke(chunkSplitter); + 
LOG.info("Closed Source chunkSplitter JDBC connection"); + } + } catch (Exception e) { + LOG.warn("Failed to close chunkSplitter via reflection", e); + } + } + + private SplitRecords pollSplitRecordsWithSplit(SourceSplitBase split, JobBaseConfig jobConfig) + throws Exception { + Preconditions.checkState(split != null, "split is null"); + SourceRecords sourceRecords = null; + String currentSplitId = null; + Fetcher currentReader = null; + LOG.info("Get a split: {}", split.splitId()); + if (split.isSnapshotSplit()) { + currentReader = getSnapshotSplitReader(jobConfig); + } else if (split.isStreamSplit()) { + currentReader = getBinlogSplitReader(jobConfig); + } + this.setCurrentReader(currentReader); + FetchTask splitFetchTask = createFetchTaskFromSplit(jobConfig, split); + currentReader.submitTask(splitFetchTask); + currentSplitId = split.splitId(); + this.setCurrentFetchTask(splitFetchTask); + // make split record available + sourceRecords = + pollUntilDataAvailable(currentReader, Constants.POLL_SPLIT_RECORDS_TIMEOUTS, 500); + if (currentReader instanceof IncrementalSourceScanFetcher) { + closeSnapshotReader(); + } + return new SplitRecords(currentSplitId, sourceRecords.iterator()); + } + + private SplitRecords pollSplitRecordsWithCurrentReader( + Fetcher currentReader) throws Exception { + Iterator dataIt = null; + if (currentReader instanceof IncrementalSourceStreamFetcher) { + dataIt = currentReader.pollSplitRecords(); + return dataIt == null + ? null + : new SplitRecords(STREAM_SPLIT_ID, dataIt.next().iterator()); + } else { + throw new IllegalStateException("Unsupported reader type."); + } + } + + /** + * Split tasks are submitted asynchronously, and data is sent to the Debezium queue. Therefore, + * there will be a time interval between retrieving data; it's necessary to fetch data until the + * queue has data. + */ + private SourceRecords pollUntilDataAvailable( + Fetcher reader, long maxWaitTimeMs, long pollIntervalMs) + throws InterruptedException { + long startTime = System.currentTimeMillis(); + long elapsedTime = 0; + int attemptCount = 0; + LOG.info("Polling until data available"); + Iterator lastDataIt = null; + while (elapsedTime < maxWaitTimeMs) { + attemptCount++; + lastDataIt = reader.pollSplitRecords(); + if (lastDataIt != null && lastDataIt.hasNext()) { + SourceRecords sourceRecords = lastDataIt.next(); + if (sourceRecords != null && !sourceRecords.getSourceRecordList().isEmpty()) { + LOG.info( + "Data available after {} ms ({} attempts). {} Records received.", + elapsedTime, + attemptCount, + sourceRecords.getSourceRecordList().size()); + // todo: poll until heartbeat ? 
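+                    // Return the first non-empty batch immediately; the caller wraps it in SplitRecords and drains it through FilteredRecordIterator.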
+ return sourceRecords; + } + } + + // No records yet, continue polling + if (elapsedTime + pollIntervalMs < maxWaitTimeMs) { + Thread.sleep(pollIntervalMs); + elapsedTime = System.currentTimeMillis() - startTime; + } else { + // Last attempt before timeout + break; + } + } + + LOG.warn( + "Timeout: No data (heartbeat or data change) received after {} ms ({} attempts).", + elapsedTime, + attemptCount); + return new SourceRecords(new ArrayList<>()); + } + + private void closeSnapshotReader() { + IncrementalSourceScanFetcher reusedSnapshotReader = this.getSnapshotReader(); + if (reusedSnapshotReader != null) { + LOG.info( + "Close snapshot reader {}", reusedSnapshotReader.getClass().getCanonicalName()); + reusedSnapshotReader.close(); + Fetcher currentReader = this.getCurrentReader(); + if (reusedSnapshotReader == currentReader) { + this.setCurrentReader(null); + } + this.setSnapshotReader(null); + } + } + + private void closeBinlogReader() { + IncrementalSourceStreamFetcher reusedBinlogReader = this.getBinlogReader(); + if (reusedBinlogReader != null) { + LOG.info("Close binlog reader {}", reusedBinlogReader.getClass().getCanonicalName()); + reusedBinlogReader.close(); + Fetcher currentReader = this.getCurrentReader(); + if (reusedBinlogReader == currentReader) { + this.setCurrentReader(null); + } + this.setBinlogReader(null); + } + } + + protected abstract FetchTask createFetchTaskFromSplit( + JobBaseConfig jobConfig, SourceSplitBase split); + + /** Get source config - to be implemented by subclasses */ + protected abstract JdbcSourceConfig getSourceConfig(JobBaseConfig config); + + @Override + public Map extractSnapshotStateOffset(Object splitState) { + Preconditions.checkNotNull(splitState, "splitState is null"); + SourceSplitState sourceSplitState = (SourceSplitState) splitState; + Offset highWatermark = sourceSplitState.asSnapshotSplitState().getHighWatermark(); + Map offsetRes = new HashMap<>(highWatermark.getOffset()); + return offsetRes; + } + + @Override + public Map extractBinlogStateOffset(Object splitState) { + Preconditions.checkNotNull(splitState, "splitState is null"); + SourceSplitState sourceSplitState = (SourceSplitState) splitState; + Offset startingOffset = sourceSplitState.asStreamSplitState().getStartingOffset(); + return new HashMap<>(startingOffset.getOffset()); + } + + @Override + public Map extractBinlogOffset(SourceSplit split) { + Preconditions.checkNotNull(split, "split is null"); + SourceSplitBase postgresSplit = (SourceSplitBase) split; + Map offsetRes = + new HashMap<>(postgresSplit.asStreamSplit().getStartingOffset().getOffset()); + return offsetRes; + } + + @Override + public boolean isBinlogSplit(SourceSplit split) { + Preconditions.checkNotNull(split, "split is null"); + SourceSplitBase postgresSplit = (SourceSplitBase) split; + return postgresSplit.isStreamSplit(); + } + + @Override + public boolean isSnapshotSplit(SourceSplit split) { + Preconditions.checkNotNull(split, "split is null"); + SourceSplitBase postgresSplit = (SourceSplitBase) split; + return postgresSplit.isSnapshotSplit(); + } + + @Override + public void finishSplitRecords() { + this.setCurrentSplitRecords(null); + } + + private Map getTableSchemas(JobBaseConfig config) { + Map schemas = this.getTableSchemas(); + if (schemas == null) { + schemas = discoverTableSchemas(config); + this.setTableSchemas(schemas); + } + return schemas; + } + + protected abstract Map discoverTableSchemas( + JobBaseConfig config); + + @Override + public void close(JobBaseConfig jobConfig) { + LOG.info("Close 
source reader for job {}", jobConfig.getJobId()); + closeSnapshotReader(); + closeBinlogReader(); + currentReader = null; + currentSplitRecords = null; + currentSplit = null; + if (tableSchemas != null) { + tableSchemas.clear(); + tableSchemas = null; + } + } + + @Override + public List deserialize(Map config, SourceRecord element) + throws IOException { + return serializer.deserialize(config, element); + } + + /** + * Filtered record iterator that only returns data change records, filtering out watermark, + * heartbeat and other events. This is a private inner class that encapsulates record filtering + * logic, making the main method cleaner. + */ + private class FilteredRecordIterator implements Iterator { + private final Iterator sourceIterator; + private final SourceSplitState splitState; + private SourceRecord nextRecord; + + FilteredRecordIterator(SplitRecords currentSplitRecords, SourceSplitState splitState) { + this.sourceIterator = + currentSplitRecords != null && !currentSplitRecords.isEmpty() + ? currentSplitRecords.getIterator() + : null; + this.splitState = splitState; + } + + @Override + public boolean hasNext() { + if (sourceIterator == null) { + return false; + } + if (nextRecord != null) { + return true; + } + + while (sourceIterator.hasNext()) { + SourceRecord element = sourceIterator.next(); + if (WatermarkEvent.isWatermarkEvent(element)) { + Offset watermark = getWatermark(element); + if (WatermarkEvent.isHighWatermarkEvent(element) + && splitState.isSnapshotSplitState()) { + splitState.asSnapshotSplitState().setHighWatermark(watermark); + } + } else if (SourceRecordUtils.isHeartbeatEvent(element)) { + LOG.debug("Receive heartbeat event: {}", element); + if (splitState.isStreamSplitState()) { + Offset position = createOffset(element.sourceOffset()); + splitState.asStreamSplitState().setStartingOffset(position); + } + } else if (SourceRecordUtils.isDataChangeRecord(element)) { + if (splitState.isStreamSplitState()) { + Offset position = createOffset(element.sourceOffset()); + splitState.asStreamSplitState().setStartingOffset(position); + } + nextRecord = element; + return true; + } else { + LOG.debug("Ignore event: {}", element); + } + } + return false; + } + + @Override + public SourceRecord next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + SourceRecord record = nextRecord; + nextRecord = null; + return record; + } + + private Offset getWatermark(SourceRecord watermarkEvent) { + Map offset = watermarkEvent.sourceOffset(); + // Extract watermark from source record offset + OffsetFactory offsetFactory = getOffsetFactory(); + Map offsetStrMap = new HashMap<>(); + for (Map.Entry entry : offset.entrySet()) { + offsetStrMap.put( + entry.getKey(), + entry.getValue() == null ? 
null : entry.getValue().toString()); + } + return offsetFactory.newOffset(offsetStrMap); + } + } +} diff --git a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/reader/SourceReader.java b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/reader/SourceReader.java index 373d10a0ef3f37..d5feeef45a32c8 100644 --- a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/reader/SourceReader.java +++ b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/reader/SourceReader.java @@ -17,9 +17,8 @@ package org.apache.doris.cdcclient.source.reader; -import org.apache.doris.cdcclient.model.response.RecordWithMeta; +import org.apache.doris.cdcclient.source.factory.DataSource; import org.apache.doris.job.cdc.request.CompareOffsetRequest; -import org.apache.doris.job.cdc.request.FetchRecordRequest; import org.apache.doris.job.cdc.request.FetchTableSplitsRequest; import org.apache.doris.job.cdc.request.JobBaseConfig; import org.apache.doris.job.cdc.request.JobBaseRecordRequest; @@ -34,23 +33,29 @@ /** Source Reader Interface */ public interface SourceReader { + String SPLIT_ID = "splitId"; + /** Initialization, called when the program starts */ - void initialize(Map config); + void initialize(long jobId, DataSource dataSource, Map config); /** Divide the data to be read. For example: split mysql to chunks */ List getSourceSplits(FetchTableSplitsRequest config); - /** Reading Data */ - RecordWithMeta read(FetchRecordRequest meta) throws Exception; - - /** Reading Data for split reader */ + /** + * 1. If the SplitRecords iterator has it, read the iterator directly. 2. If there is a stream + * reader, poll it. 3. If there is none, resubmit split. 4. If reload is true, need to reset + * streamSplitReader and submit split. + */ SplitReadResult readSplitRecords(JobBaseRecordRequest baseReq) throws Exception; /** Extract offset information from snapshot split state. */ - Map extractSnapshotOffset(SourceSplit split, Object splitState); + Map extractSnapshotStateOffset(Object splitState); + + /** Extract offset information from binlog split states. */ + Map extractBinlogStateOffset(Object splitState); /** Extract offset information from binlog split. */ - Map extractBinlogOffset(SourceSplit split); + Map extractBinlogOffset(SourceSplit splitState); /** Is the split a binlog split */ boolean isBinlogSplit(SourceSplit split); @@ -68,7 +73,13 @@ public interface SourceReader { int compareOffset(CompareOffsetRequest compareOffsetRequest); /** Called when closing */ - void close(Long jobId); + void close(JobBaseConfig jobConfig); List deserialize(Map config, SourceRecord element) throws IOException; + + /** + * Commits the given offset with the source database. Used by some source like Postgres to + * indicate how far the source TX log can be discarded. 
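+     * For Postgres this typically means acknowledging the offset on the replication slot so that older WAL segments can be recycled; the default implementation is a no-op for sources that do not need it.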
+ */ + default void commitSourceOffset(Long jobId, SourceSplit sourceSplit) {} } diff --git a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/reader/SplitRecords.java b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/reader/SplitRecords.java index d7c712e463d116..b6afd3575c6210 100644 --- a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/reader/SplitRecords.java +++ b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/reader/SplitRecords.java @@ -17,44 +17,33 @@ package org.apache.doris.cdcclient.source.reader; -import org.apache.flink.cdc.connectors.mysql.source.split.SourceRecords; import org.apache.kafka.connect.source.SourceRecord; import java.util.Iterator; public class SplitRecords { private final String splitId; - private final SourceRecords records; - private Iterator iterator; + private final Iterator iterator; - public SplitRecords(String splitId, SourceRecords records) { + public SplitRecords(String splitId, Iterator iterator) { this.splitId = splitId; - this.records = records; - this.iterator = records.iterator(); + this.iterator = iterator; } public String getSplitId() { return splitId; } - public SourceRecords getRecords() { - return records; - } - public Iterator getIterator() { return iterator; } - public void setIterator(Iterator iterator) { - this.iterator = iterator; - } - public boolean isEmpty() { - return splitId == null || records == null || !records.iterator().hasNext(); + return splitId == null || !iterator.hasNext(); } @Override public String toString() { - return "SplitRecords{" + "split=" + splitId + ", records=" + records + '}'; + return "SplitRecords{" + "split=" + splitId + '}'; } } diff --git a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/reader/mysql/MySqlSourceReader.java b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/reader/mysql/MySqlSourceReader.java index d9ff9ab81061e9..27fbf3be88b363 100644 --- a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/reader/mysql/MySqlSourceReader.java +++ b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/reader/mysql/MySqlSourceReader.java @@ -17,16 +17,16 @@ package org.apache.doris.cdcclient.source.reader.mysql; -import org.apache.doris.cdcclient.model.response.RecordWithMeta; +import org.apache.doris.cdcclient.common.Constants; import org.apache.doris.cdcclient.source.deserialize.DebeziumJsonDeserializer; import org.apache.doris.cdcclient.source.deserialize.SourceRecordDeserializer; +import org.apache.doris.cdcclient.source.factory.DataSource; import org.apache.doris.cdcclient.source.reader.SourceReader; import org.apache.doris.cdcclient.source.reader.SplitReadResult; import org.apache.doris.cdcclient.source.reader.SplitRecords; import org.apache.doris.cdcclient.utils.ConfigUtil; import org.apache.doris.job.cdc.DataSourceConfigKeys; import org.apache.doris.job.cdc.request.CompareOffsetRequest; -import org.apache.doris.job.cdc.request.FetchRecordRequest; import org.apache.doris.job.cdc.request.FetchTableSplitsRequest; import org.apache.doris.job.cdc.request.JobBaseConfig; import org.apache.doris.job.cdc.request.JobBaseRecordRequest; @@ -35,6 +35,7 @@ import org.apache.doris.job.cdc.split.SnapshotSplit; import org.apache.commons.collections.CollectionUtils; +import org.apache.commons.collections.MapUtils; import org.apache.flink.api.connector.source.SourceSplit; import org.apache.flink.api.connector.source.mocks.MockSplitEnumeratorContext; import 
org.apache.flink.api.java.tuple.Tuple2; @@ -46,7 +47,9 @@ import org.apache.flink.cdc.connectors.mysql.debezium.task.context.StatefulTaskContext; import org.apache.flink.cdc.connectors.mysql.source.assigners.MySqlSnapshotSplitAssigner; import org.apache.flink.cdc.connectors.mysql.source.config.MySqlSourceConfig; +import org.apache.flink.cdc.connectors.mysql.source.config.MySqlSourceConfigFactory; import org.apache.flink.cdc.connectors.mysql.source.offset.BinlogOffset; +import org.apache.flink.cdc.connectors.mysql.source.offset.BinlogOffsetUtils; import org.apache.flink.cdc.connectors.mysql.source.split.FinishedSnapshotSplitInfo; import org.apache.flink.cdc.connectors.mysql.source.split.MySqlBinlogSplit; import org.apache.flink.cdc.connectors.mysql.source.split.MySqlBinlogSplitState; @@ -59,6 +62,7 @@ import org.apache.flink.cdc.connectors.mysql.source.utils.RecordUtils; import org.apache.flink.cdc.connectors.mysql.source.utils.TableDiscoveryUtils; import org.apache.flink.cdc.connectors.mysql.table.StartupMode; +import org.apache.flink.cdc.connectors.mysql.table.StartupOptions; import org.apache.flink.cdc.debezium.history.FlinkJsonTableChangeSerializer; import org.apache.flink.table.types.logical.RowType; import org.apache.kafka.connect.source.SourceRecord; @@ -66,6 +70,7 @@ import java.io.IOException; import java.sql.SQLException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; @@ -73,13 +78,18 @@ import java.util.Map; import java.util.NoSuchElementException; import java.util.Optional; +import java.util.Properties; import java.util.concurrent.ConcurrentHashMap; +import static org.apache.doris.cdcclient.utils.ConfigUtil.is13Timestamp; +import static org.apache.doris.cdcclient.utils.ConfigUtil.isJson; +import static org.apache.doris.cdcclient.utils.ConfigUtil.toStringMap; import static org.apache.flink.cdc.connectors.mysql.source.assigners.MySqlBinlogSplitAssigner.BINLOG_SPLIT_ID; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.github.shyiko.mysql.binlog.BinaryLogClient; +import com.mysql.cj.conf.ConnectionUrl; import io.debezium.connector.mysql.MySqlConnection; import io.debezium.connector.mysql.MySqlPartition; import io.debezium.document.Array; @@ -95,7 +105,6 @@ public class MySqlSourceReader implements SourceReader { private static final Logger LOG = LoggerFactory.getLogger(MySqlSourceReader.class); private static ObjectMapper objectMapper = new ObjectMapper(); - private static final String SPLIT_ID = "splitId"; private static final FlinkJsonTableChangeSerializer TABLE_CHANGE_SERIALIZER = new FlinkJsonTableChangeSerializer(); private SourceRecordDeserializer> serializer; @@ -111,7 +120,7 @@ public MySqlSourceReader() { } @Override - public void initialize(Map config) { + public void initialize(long jobId, DataSource dataSource, Map config) { this.serializer.init(config); } @@ -157,18 +166,6 @@ public List getSourceSplits(FetchTableSplitsRequest ftsReq) return splits; } - /** - * 1. If the SplitRecords iterator has it, read the iterator directly. 2. If there is a - * binlogreader, poll it. 3. If there is none, resubmit split. 4. If reload is true, need to - * reset binlogSplitReader and submit split. 
- */ - @Override - public RecordWithMeta read(FetchRecordRequest fetchRecord) throws Exception { - SplitReadResult readResult = readSplitRecords(fetchRecord); - return buildRecordResponse(fetchRecord, readResult); - } - - /** read split records. */ @Override public SplitReadResult readSplitRecords(JobBaseRecordRequest baseReq) throws Exception { Map offsetMeta = baseReq.getMeta(); @@ -228,61 +225,6 @@ public SplitReadResult readSplitRecords(JobBaseRecordRequest baseReq) throws Exc return result; } - /** build RecordWithMeta */ - private RecordWithMeta buildRecordResponse( - FetchRecordRequest fetchRecord, SplitReadResult readResult) throws Exception { - RecordWithMeta recordResponse = new RecordWithMeta(); - SourceSplit split = readResult.getSplit(); - int count = 0; - try { - // Serialize records and add them to the response (collect from iterator) - Iterator iterator = readResult.getRecordIterator(); - while (iterator != null && iterator.hasNext()) { - SourceRecord element = iterator.next(); - List serializedRecords = - serializer.deserialize(fetchRecord.getConfig(), element); - if (!CollectionUtils.isEmpty(serializedRecords)) { - recordResponse.getRecords().addAll(serializedRecords); - count += serializedRecords.size(); - // update meta - Map lastMeta = - RecordUtils.getBinlogPosition(element).getOffset(); - if (isBinlogSplit(split)) { - lastMeta.put(SPLIT_ID, BINLOG_SPLIT_ID); - recordResponse.setMeta(lastMeta); - } - if (count >= fetchRecord.getFetchSize()) { - return recordResponse; - } - } - } - } finally { - finishSplitRecords(); - } - - // Set meta information - if (isSnapshotSplit(split) && readResult.getSplitState() != null) { - Map offsetRes = - extractSnapshotOffset(split, readResult.getSplitState()); - recordResponse.setMeta(offsetRes); - return recordResponse; - } - if (CollectionUtils.isEmpty(recordResponse.getRecords())) { - if (isBinlogSplit(split)) { - Map offsetRes = extractBinlogOffset(readResult.getSplit()); - recordResponse.setMeta(offsetRes); - } else { - SnapshotSplit snapshotSplit = - objectMapper.convertValue(fetchRecord.getMeta(), SnapshotSplit.class); - Map meta = new HashMap<>(); - meta.put(SPLIT_ID, snapshotSplit.getSplitId()); - // chunk no data - recordResponse.setMeta(meta); - } - } - return recordResponse; - } - /** * refresh table changes after schema change * @@ -488,7 +430,7 @@ private void closeChunkSplitterOnly(MySqlSnapshotSplitAssigner splitAssigner) { private SplitRecords pollSplitRecordsWithSplit(MySqlSplit split, JobBaseConfig jobConfig) throws Exception { Preconditions.checkState(split != null, "split is null"); - Iterator dataIt = null; + SourceRecords sourceRecords = null; String currentSplitId = null; DebeziumReader currentReader = null; LOG.info("Get a split: {}", split.splitId()); @@ -501,13 +443,12 @@ private SplitRecords pollSplitRecordsWithSplit(MySqlSplit split, JobBaseConfig j currentReader.submitSplit(split); currentSplitId = split.splitId(); // make split record available - // todo: Until debezium_heartbeat is consumed - Thread.sleep(1000); - dataIt = currentReader.pollSplitRecords(); + sourceRecords = + pollUntilDataAvailable(currentReader, Constants.POLL_SPLIT_RECORDS_TIMEOUTS, 500); if (currentReader instanceof SnapshotSplitReader) { closeSnapshotReader(); } - return dataIt == null ? 
null : new SplitRecords(currentSplitId, dataIt.next()); + return new SplitRecords(currentSplitId, sourceRecords.iterator()); } private SplitRecords pollSplitRecordsWithCurrentReader( @@ -515,12 +456,62 @@ private SplitRecords pollSplitRecordsWithCurrentReader( Iterator dataIt = null; if (currentReader instanceof BinlogSplitReader) { dataIt = currentReader.pollSplitRecords(); - return dataIt == null ? null : new SplitRecords(BINLOG_SPLIT_ID, dataIt.next()); + return dataIt == null + ? null + : new SplitRecords(BINLOG_SPLIT_ID, dataIt.next().iterator()); } else { throw new IllegalStateException("Unsupported reader type."); } } + /** + * Split tasks are submitted asynchronously, and data is sent to the Debezium queue. Therefore, + * there will be a time interval between retrieving data; it's necessary to fetch data until the + * queue has data. + */ + private SourceRecords pollUntilDataAvailable( + DebeziumReader reader, + long maxWaitTimeMs, + long pollIntervalMs) + throws InterruptedException { + long startTime = System.currentTimeMillis(); + long elapsedTime = 0; + int attemptCount = 0; + LOG.info("Polling until data available"); + Iterator lastDataIt = null; + while (elapsedTime < maxWaitTimeMs) { + attemptCount++; + lastDataIt = reader.pollSplitRecords(); + if (lastDataIt != null && lastDataIt.hasNext()) { + SourceRecords sourceRecords = lastDataIt.next(); + if (sourceRecords != null && !sourceRecords.getSourceRecordList().isEmpty()) { + LOG.info( + "Data available after {} ms ({} attempts). {} Records received.", + elapsedTime, + attemptCount, + sourceRecords.getSourceRecordList().size()); + // todo: Until debezium_heartbeat is consumed + return sourceRecords; + } + } + + // No records yet, continue polling + if (elapsedTime + pollIntervalMs < maxWaitTimeMs) { + Thread.sleep(pollIntervalMs); + elapsedTime = System.currentTimeMillis() - startTime; + } else { + // Last attempt before timeout + break; + } + } + + LOG.warn( + "Timeout: No data (heartbeat or data change) received after {} ms ({} attempts).", + elapsedTime, + attemptCount); + return new SourceRecords(new ArrayList<>()); + } + private SnapshotSplitReader getSnapshotSplitReader(JobBaseConfig config) { MySqlSourceConfig sourceConfig = getSourceConfig(config); SnapshotSplitReader snapshotReader = this.getSnapshotReader(); @@ -581,27 +572,132 @@ private void closeBinlogReader() { } private MySqlSourceConfig getSourceConfig(JobBaseConfig config) { - return ConfigUtil.generateMySqlConfig(config); + return generateMySqlConfig(config); + } + + /** Generate MySQL source config from JobBaseConfig */ + private MySqlSourceConfig generateMySqlConfig(JobBaseConfig config) { + return generateMySqlConfig(config.getConfig(), ConfigUtil.getServerId(config.getJobId())); + } + + /** Generate MySQL source config from Map config */ + private MySqlSourceConfig generateMySqlConfig(Map cdcConfig, String serverId) { + MySqlSourceConfigFactory configFactory = new MySqlSourceConfigFactory(); + ConnectionUrl cu = + ConnectionUrl.getConnectionUrlInstance( + cdcConfig.get(DataSourceConfigKeys.JDBC_URL), null); + configFactory.hostname(cu.getMainHost().getHost()); + configFactory.port(cu.getMainHost().getPort()); + configFactory.username(cdcConfig.get(DataSourceConfigKeys.USER)); + configFactory.password(cdcConfig.get(DataSourceConfigKeys.PASSWORD)); + String databaseName = cdcConfig.get(DataSourceConfigKeys.DATABASE); + configFactory.databaseList(databaseName); + configFactory.serverId(serverId); + configFactory.serverTimeZone( + 
ConfigUtil.getTimeZoneFromProps(cu.getOriginalProperties()).toString()); + + configFactory.includeSchemaChanges(false); + + String includingTables = cdcConfig.get(DataSourceConfigKeys.INCLUDE_TABLES); + String[] includingTbls = + Arrays.stream(includingTables.split(",")) + .map(t -> databaseName + "." + t.trim()) + .toArray(String[]::new); + configFactory.tableList(includingTbls); + + // setting startMode + String startupMode = cdcConfig.get(DataSourceConfigKeys.OFFSET); + if (DataSourceConfigKeys.OFFSET_INITIAL.equalsIgnoreCase(startupMode)) { + // do not need set offset when initial + // configFactory.startupOptions(StartupOptions.initial()); + } else if (DataSourceConfigKeys.OFFSET_EARLIEST.equalsIgnoreCase(startupMode)) { + configFactory.startupOptions(StartupOptions.earliest()); + BinlogOffset binlogOffset = + initializeEffectiveOffset( + configFactory, StartupOptions.earliest().binlogOffset); + configFactory.startupOptions(StartupOptions.specificOffset(binlogOffset)); + } else if (DataSourceConfigKeys.OFFSET_LATEST.equalsIgnoreCase(startupMode)) { + configFactory.startupOptions(StartupOptions.latest()); + BinlogOffset binlogOffset = + initializeEffectiveOffset(configFactory, StartupOptions.latest().binlogOffset); + configFactory.startupOptions(StartupOptions.specificOffset(binlogOffset)); + } else if (isJson(startupMode)) { + // start from specific offset + Map offsetMap = toStringMap(startupMode); + if (MapUtils.isEmpty(offsetMap)) { + throw new RuntimeException("Incorrect offset " + startupMode); + } + if (offsetMap.containsKey(BinlogOffset.BINLOG_FILENAME_OFFSET_KEY) + && offsetMap.containsKey(BinlogOffset.BINLOG_POSITION_OFFSET_KEY)) { + BinlogOffset binlogOffset = new BinlogOffset(offsetMap); + configFactory.startupOptions(StartupOptions.specificOffset(binlogOffset)); + } else { + throw new RuntimeException("Incorrect offset " + startupMode); + } + } else if (is13Timestamp(startupMode)) { + // start from timestamp + Long ts = Long.parseLong(startupMode); + BinlogOffset binlogOffset = + initializeEffectiveOffset( + configFactory, StartupOptions.timestamp(ts).binlogOffset); + configFactory.startupOptions(StartupOptions.specificOffset(binlogOffset)); + } else { + throw new RuntimeException("Unknown offset " + startupMode); + } + + Properties jdbcProperteis = new Properties(); + jdbcProperteis.putAll(cu.getOriginalProperties()); + configFactory.jdbcProperties(jdbcProperteis); + + // Properties dbzProps = new Properties(); + // dbzProps.setProperty( + // MySqlConnectorConfig.KEEP_ALIVE_INTERVAL_MS.name(), + // String.valueOf(Constants.DEBEZIUM_HEARTBEAT_INTERVAL_MS)); + // configFactory.debeziumProperties(dbzProps); + // + // configFactory.heartbeatInterval( + // Duration.ofMillis(Constants.DEBEZIUM_HEARTBEAT_INTERVAL_MS)); + if (cdcConfig.containsKey(DataSourceConfigKeys.SPLIT_SIZE)) { + configFactory.splitSize( + Integer.parseInt(cdcConfig.get(DataSourceConfigKeys.SPLIT_SIZE))); + } + + return configFactory.createConfig(0); + } + + private BinlogOffset initializeEffectiveOffset( + MySqlSourceConfigFactory configFactory, BinlogOffset binlogOffset) { + MySqlSourceConfig config = configFactory.createConfig(0); + try (MySqlConnection connection = DebeziumUtils.createMySqlConnection(config)) { + return BinlogOffsetUtils.initializeEffectiveOffset(binlogOffset, connection, config); + } catch (SQLException e) { + throw new RuntimeException(e); + } } @Override - public Map extractSnapshotOffset(SourceSplit split, Object splitState) { - Preconditions.checkNotNull(split, "split is null"); + 
public Map extractSnapshotStateOffset(Object splitState) { Preconditions.checkNotNull(splitState, "splitState is null"); MySqlSplitState mysqlSplitState = (MySqlSplitState) splitState; - MySqlSplit mysqlSplit = (MySqlSplit) split; BinlogOffset highWatermark = mysqlSplitState.asSnapshotSplitState().getHighWatermark(); Map offsetRes = new HashMap<>(highWatermark.getOffset()); - offsetRes.put(SPLIT_ID, mysqlSplit.splitId()); return offsetRes; } + @Override + public Map extractBinlogStateOffset(Object splitState) { + Preconditions.checkNotNull(splitState, "splitState is null"); + MySqlSplitState mysqlSplitState = (MySqlSplitState) splitState; + BinlogOffset startingOffset = mysqlSplitState.asBinlogSplitState().getStartingOffset(); + return new HashMap<>(startingOffset.getOffset()); + } + @Override public Map extractBinlogOffset(SourceSplit split) { Preconditions.checkNotNull(split, "split is null"); MySqlSplit mysqlSplit = (MySqlSplit) split; - Map offsetRes = mysqlSplit.asBinlogSplit().getStartingOffset().getOffset(); - offsetRes.put(SPLIT_ID, BINLOG_SPLIT_ID); + Map offsetRes = + new HashMap<>(mysqlSplit.asBinlogSplit().getStartingOffset().getOffset()); return offsetRes; } @@ -676,14 +772,16 @@ private Map discoverTableSchemas(JobBaseConfi } @Override - public void close(Long jobId) { - LOG.info("Close source reader for job {}", jobId); + public void close(JobBaseConfig jobConfig) { + LOG.info("Close source reader for job {}", jobConfig.getJobId()); closeSnapshotReader(); closeBinlogReader(); currentReader = null; currentSplitRecords = null; - tableSchemas.clear(); - tableSchemas = null; + if (tableSchemas != null) { + tableSchemas.clear(); + tableSchemas = null; + } } @Override @@ -734,6 +832,10 @@ public boolean hasNext() { splitState.asBinlogSplitState().setStartingOffset(position); } } else if (RecordUtils.isDataChangeRecord(element)) { + if (splitState.isBinlogSplitState()) { + BinlogOffset position = RecordUtils.getBinlogPosition(element); + splitState.asBinlogSplitState().setStartingOffset(position); + } nextRecord = element; return true; } else { diff --git a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/reader/postgres/PostgresSourceReader.java b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/reader/postgres/PostgresSourceReader.java new file mode 100644 index 00000000000000..3e94700f2b464f --- /dev/null +++ b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/source/reader/postgres/PostgresSourceReader.java @@ -0,0 +1,388 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
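+// Descriptive note (grounded in the code below): Postgres implementation of JdbcIncrementalSourceReader. It reads connection settings from DataSourceConfigKeys.JDBC_URL/USER/PASSWORD/SCHEMA plus optional INCLUDE_TABLES, OFFSET and SPLIT_SIZE, creates the replication slot "doris_cdc_<jobId>" on initialize, drops it on close, and commits consumed offsets back to the slot so Postgres can discard old WAL.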
+ +package org.apache.doris.cdcclient.source.reader.postgres; + +import org.apache.doris.cdcclient.exception.CdcClientException; +import org.apache.doris.cdcclient.source.factory.DataSource; +import org.apache.doris.cdcclient.source.reader.JdbcIncrementalSourceReader; +import org.apache.doris.cdcclient.utils.ConfigUtil; +import org.apache.doris.job.cdc.DataSourceConfigKeys; +import org.apache.doris.job.cdc.request.CompareOffsetRequest; +import org.apache.doris.job.cdc.request.JobBaseConfig; + +import org.apache.commons.lang3.StringUtils; +import org.apache.flink.api.connector.source.SourceSplit; +import org.apache.flink.cdc.connectors.base.config.JdbcSourceConfig; +import org.apache.flink.cdc.connectors.base.dialect.JdbcDataSourceDialect; +import org.apache.flink.cdc.connectors.base.options.StartupOptions; +import org.apache.flink.cdc.connectors.base.source.meta.offset.Offset; +import org.apache.flink.cdc.connectors.base.source.meta.offset.OffsetFactory; +import org.apache.flink.cdc.connectors.base.source.meta.split.SourceSplitBase; +import org.apache.flink.cdc.connectors.base.source.meta.split.StreamSplit; +import org.apache.flink.cdc.connectors.base.source.reader.external.FetchTask; +import org.apache.flink.cdc.connectors.base.source.reader.external.IncrementalSourceScanFetcher; +import org.apache.flink.cdc.connectors.base.source.reader.external.IncrementalSourceStreamFetcher; +import org.apache.flink.cdc.connectors.postgres.source.PostgresDialect; +import org.apache.flink.cdc.connectors.postgres.source.config.PostgresSourceConfig; +import org.apache.flink.cdc.connectors.postgres.source.config.PostgresSourceConfigFactory; +import org.apache.flink.cdc.connectors.postgres.source.fetch.PostgresSourceFetchTaskContext; +import org.apache.flink.cdc.connectors.postgres.source.fetch.PostgresStreamFetchTask; +import org.apache.flink.cdc.connectors.postgres.source.offset.PostgresOffset; +import org.apache.flink.cdc.connectors.postgres.source.offset.PostgresOffsetFactory; +import org.apache.flink.cdc.connectors.postgres.source.utils.CustomPostgresSchema; +import org.apache.flink.cdc.connectors.postgres.source.utils.PostgresTypeUtils; +import org.apache.flink.cdc.connectors.postgres.source.utils.TableDiscoveryUtils; +import org.apache.flink.table.types.DataType; + +import java.time.Instant; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Preconditions; +import io.debezium.connector.postgresql.SourceInfo; +import io.debezium.connector.postgresql.connection.PostgresConnection; +import io.debezium.connector.postgresql.connection.PostgresReplicationConnection; +import io.debezium.connector.postgresql.spi.SlotState; +import io.debezium.jdbc.JdbcConnection; +import io.debezium.relational.Column; +import io.debezium.relational.TableId; +import io.debezium.relational.history.TableChanges; +import io.debezium.time.Conversions; +import lombok.Data; +import org.postgresql.Driver; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@Data +public class PostgresSourceReader extends JdbcIncrementalSourceReader { + private static final Logger LOG = LoggerFactory.getLogger(PostgresSourceReader.class); + private static final ObjectMapper objectMapper = new ObjectMapper(); + + public PostgresSourceReader() { + super(); + } + + @Override + public void initialize(long jobId, DataSource dataSource, Map config) { + PostgresSourceConfig 
sourceConfig = generatePostgresConfig(config, jobId); + PostgresDialect dialect = new PostgresDialect(sourceConfig); + LOG.info("Creating slot for job {}, user {}", jobId, sourceConfig.getUsername()); + createSlotForGlobalStreamSplit(dialect); + super.initialize(jobId, dataSource, config); + } + + /** + * copy from org.apache.flink.cdc.connectors.postgres.source + * .enumerator.PostgresSourceEnumerator.createSlotForGlobalStreamSplit + * + *
Create slot for the unique global stream split. + * + *
Currently all startup modes need read the stream split. We need open the slot before + * reading the globalStreamSplit to catch all data changes. + */ + private void createSlotForGlobalStreamSplit(PostgresDialect postgresDialect) { + try (PostgresConnection connection = postgresDialect.openJdbcConnection()) { + SlotState slotInfo = + connection.getReplicationSlotState( + postgresDialect.getSlotName(), postgresDialect.getPluginName()); + // skip creating the replication slot when the slot exists. + if (slotInfo != null) { + return; + } + PostgresReplicationConnection replicationConnection = + postgresDialect.openPostgresReplicationConnection(connection); + replicationConnection.createReplicationSlot(); + replicationConnection.close(false); + + } catch (Throwable t) { + throw new CdcClientException( + String.format( + "Fail to get or create slot for global stream split, the slot name is %s. Due to: ", + postgresDialect.getSlotName()), + t); + } + } + + @Override + protected PostgresSourceConfig getSourceConfig(JobBaseConfig config) { + return generatePostgresConfig(config); + } + + /** Generate PostgreSQL source config from JobBaseConfig */ + private PostgresSourceConfig generatePostgresConfig(JobBaseConfig config) { + return generatePostgresConfig(config.getConfig(), config.getJobId()); + } + + /** Generate PostgreSQL source config from Map config */ + private PostgresSourceConfig generatePostgresConfig(Map cdcConfig, Long jobId) { + PostgresSourceConfigFactory configFactory = new PostgresSourceConfigFactory(); + + // Parse JDBC URL to extract connection info + String jdbcUrl = cdcConfig.get(DataSourceConfigKeys.JDBC_URL); + Preconditions.checkNotNull(jdbcUrl, "jdbc_url is required"); + + // PostgreSQL JDBC URL format: jdbc:postgresql://host:port/database + Properties props = Driver.parseURL(jdbcUrl, null); + Preconditions.checkNotNull(props, "Invalid JDBC URL: " + jdbcUrl); + + String hostname = props.getProperty("PGHOST"); + String port = props.getProperty("PGPORT"); + String database = props.getProperty("PGDBNAME"); + Preconditions.checkNotNull(hostname, "host is required"); + Preconditions.checkNotNull(port, "port is required"); + Preconditions.checkNotNull(database, "database is required"); + + configFactory.hostname(hostname); + configFactory.port(Integer.parseInt(port)); + configFactory.username(cdcConfig.get(DataSourceConfigKeys.USER)); + configFactory.password(cdcConfig.get(DataSourceConfigKeys.PASSWORD)); + configFactory.database(database); + + String schema = cdcConfig.get(DataSourceConfigKeys.SCHEMA); + Preconditions.checkNotNull(schema, "schema is required"); + configFactory.schemaList(new String[] {schema}); + configFactory.includeSchemaChanges(false); + + // Set table list + String includingTables = cdcConfig.get(DataSourceConfigKeys.INCLUDE_TABLES); + if (StringUtils.isNotEmpty(includingTables)) { + String[] includingTbls = + Arrays.stream(includingTables.split(",")) + .map(t -> schema + "." 
+ t.trim()) + .toArray(String[]::new); + configFactory.tableList(includingTbls); + } + + // Set startup options + String startupMode = cdcConfig.get(DataSourceConfigKeys.OFFSET); + if (DataSourceConfigKeys.OFFSET_INITIAL.equalsIgnoreCase(startupMode)) { + configFactory.startupOptions(StartupOptions.initial()); + } else if (DataSourceConfigKeys.OFFSET_EARLIEST.equalsIgnoreCase(startupMode)) { + configFactory.startupOptions(StartupOptions.earliest()); + } else if (DataSourceConfigKeys.OFFSET_LATEST.equalsIgnoreCase(startupMode)) { + configFactory.startupOptions(StartupOptions.latest()); + } else if (ConfigUtil.isJson(startupMode)) { + throw new RuntimeException("Unsupported json offset " + startupMode); + } else if (ConfigUtil.is13Timestamp(startupMode)) { + // start from timestamp + Long ts = Long.parseLong(startupMode); + configFactory.startupOptions(StartupOptions.timestamp(ts)); + } else { + throw new RuntimeException("Unknown offset " + startupMode); + } + + // Set split size if provided + if (cdcConfig.containsKey(DataSourceConfigKeys.SPLIT_SIZE)) { + configFactory.splitSize( + Integer.parseInt(cdcConfig.get(DataSourceConfigKeys.SPLIT_SIZE))); + } + + Properties dbzProps = new Properties(); + dbzProps.put("interval.handling.mode", "string"); + configFactory.debeziumProperties(dbzProps); + + configFactory.serverTimeZone( + ConfigUtil.getPostgresServerTimeZoneFromProps(props).toString()); + configFactory.slotName(getSlotName(jobId)); + configFactory.decodingPluginName("pgoutput"); + // configFactory.heartbeatInterval(Duration.ofMillis(Constants.POLL_SPLIT_RECORDS_TIMEOUTS)); + return configFactory.create(0); + } + + private String getSlotName(Long jobId) { + return "doris_cdc_" + jobId; + } + + @Override + protected IncrementalSourceScanFetcher getSnapshotSplitReader(JobBaseConfig config) { + PostgresSourceConfig sourceConfig = getSourceConfig(config); + IncrementalSourceScanFetcher snapshotReader = this.getSnapshotReader(); + if (snapshotReader == null) { + PostgresDialect dialect = new PostgresDialect(sourceConfig); + PostgresSourceFetchTaskContext taskContext = + new PostgresSourceFetchTaskContext(sourceConfig, dialect); + snapshotReader = new IncrementalSourceScanFetcher(taskContext, 0); + this.setSnapshotReader(snapshotReader); + } + return snapshotReader; + } + + @Override + protected IncrementalSourceStreamFetcher getBinlogSplitReader(JobBaseConfig config) { + PostgresSourceConfig sourceConfig = getSourceConfig(config); + IncrementalSourceStreamFetcher binlogReader = this.getBinlogReader(); + if (binlogReader == null) { + PostgresDialect dialect = new PostgresDialect(sourceConfig); + PostgresSourceFetchTaskContext taskContext = + new PostgresSourceFetchTaskContext(sourceConfig, dialect); + binlogReader = new IncrementalSourceStreamFetcher(taskContext, 0); + this.setBinlogReader(binlogReader); + } + return binlogReader; + } + + @Override + protected OffsetFactory getOffsetFactory() { + return new PostgresOffsetFactory(); + } + + @Override + protected Offset createOffset(Map offset) { + return PostgresOffset.of(offset); + } + + @Override + protected Offset createInitialOffset() { + return PostgresOffset.INITIAL_OFFSET; + } + + @Override + protected Offset createNoStoppingOffset() { + return PostgresOffset.NO_STOPPING_OFFSET; + } + + @Override + protected JdbcDataSourceDialect getDialect(JdbcSourceConfig sourceConfig) { + return new PostgresDialect((PostgresSourceConfig) sourceConfig); + } + + @Override + protected DataType fromDbzColumn(Column splitColumn) { + return 
PostgresTypeUtils.fromDbzColumn(splitColumn); + } + + /** + * Why not call dialect.displayCurrentOffset(sourceConfig)? The underlying implementation calls + * `txid_current()`, which advances the WAL. Here a plain query that retrieves the LSN is + * sufficient, because `PostgresOffset.compare` only compares the LSN. + */ + @Override + public Map getEndOffset(JobBaseConfig jobConfig) { + PostgresSourceConfig sourceConfig = getSourceConfig(jobConfig); + try { + PostgresDialect dialect = new PostgresDialect(sourceConfig); + try (JdbcConnection jdbcConnection = dialect.openJdbcConnection(sourceConfig)) { + PostgresConnection pgConnection = (PostgresConnection) jdbcConnection; + Long lsn = pgConnection.currentXLogLocation(); + Map offsetMap = new HashMap<>(); + offsetMap.put(SourceInfo.LSN_KEY, lsn.toString()); + offsetMap.put( + SourceInfo.TIMESTAMP_USEC_KEY, + String.valueOf(Conversions.toEpochMicros(Instant.MIN))); + return offsetMap; + } + } catch (Exception ex) { + throw new RuntimeException(ex); + } + } + + @Override + public int compareOffset(CompareOffsetRequest compareOffsetRequest) { + Map offsetFirst = compareOffsetRequest.getOffsetFirst(); + Map offsetSecond = compareOffsetRequest.getOffsetSecond(); + + PostgresOffset postgresOffset1 = PostgresOffset.of(offsetFirst); + PostgresOffset postgresOffset2 = PostgresOffset.of(offsetSecond); + return postgresOffset1.compareTo(postgresOffset2); + } + + @Override + protected Map discoverTableSchemas(JobBaseConfig config) { + PostgresSourceConfig sourceConfig = getSourceConfig(config); + try { + PostgresDialect dialect = new PostgresDialect(sourceConfig); + try (JdbcConnection jdbcConnection = dialect.openJdbcConnection(sourceConfig)) { + List tableIds = + TableDiscoveryUtils.listTables( + sourceConfig.getDatabaseList().get(0), + jdbcConnection, + sourceConfig.getTableFilters(), + sourceConfig.includePartitionedTables()); + CustomPostgresSchema customPostgresSchema = + new CustomPostgresSchema((PostgresConnection) jdbcConnection, sourceConfig); + return customPostgresSchema.getTableSchema(tableIds); + } + } catch (Exception ex) { + throw new RuntimeException(ex); + } + } + + @Override + protected FetchTask createFetchTaskFromSplit( + JobBaseConfig jobConfig, SourceSplitBase split) { + PostgresSourceConfig sourceConfig = getSourceConfig(jobConfig); + PostgresDialect dialect = new PostgresDialect(sourceConfig); + FetchTask fetchTask = dialect.createFetchTask(split); + return fetchTask; + } + + /** + * This method only commits up to the startOffset of the current split; even if + * `CommitFeOffset` fails, data after the startOffset will not be cleared. 
+ */ + @Override + public void commitSourceOffset(Long jobId, SourceSplit sourceSplit) { + try { + if (sourceSplit instanceof StreamSplit) { + Offset offsetToCommit = ((StreamSplit) sourceSplit).getStartingOffset(); + if (getCurrentFetchTask() != null + && getCurrentFetchTask() instanceof PostgresStreamFetchTask) { + ((PostgresStreamFetchTask) getCurrentFetchTask()) + .commitCurrentOffset(offsetToCommit); + LOG.info( + "Committing job {} postgres offset {} for {}", + jobId, + offsetToCommit, + getCurrentFetchTask().getSplit()); + } + } + } catch (Exception e) { + LOG.warn( + "Failed to commit {} postgres offset for split {}: {}", + jobId, + sourceSplit, + e.getMessage(), + e); + } + } + + @Override + public void close(JobBaseConfig jobConfig) { + super.close(jobConfig); + // drop pg slot + try { + PostgresSourceConfig sourceConfig = getSourceConfig(jobConfig); + PostgresDialect dialect = new PostgresDialect(sourceConfig); + String slotName = getSlotName(jobConfig.getJobId()); + LOG.info( + "Dropping postgres replication slot {} for job {}", + slotName, + jobConfig.getJobId()); + dialect.removeSlot(slotName); + } catch (Exception ex) { + LOG.warn( + "Failed to drop postgres replication slot for job {}: {}", + jobConfig.getJobId(), + ex.getMessage()); + } + } +} diff --git a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/utils/ConfigUtil.java b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/utils/ConfigUtil.java index 6cf84a5340d709..b8503adf7b5302 100644 --- a/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/utils/ConfigUtil.java +++ b/fs_brokers/cdc_client/src/main/java/org/apache/doris/cdcclient/utils/ConfigUtil.java @@ -17,24 +17,10 @@ package org.apache.doris.cdcclient.utils; -import org.apache.doris.job.cdc.DataSourceConfigKeys; -import org.apache.doris.job.cdc.request.JobBaseConfig; - -import org.apache.commons.collections.MapUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.flink.cdc.connectors.mysql.debezium.DebeziumUtils; -import org.apache.flink.cdc.connectors.mysql.source.config.MySqlSourceConfig; -import org.apache.flink.cdc.connectors.mysql.source.config.MySqlSourceConfigFactory; -import org.apache.flink.cdc.connectors.mysql.source.offset.BinlogOffset; -import org.apache.flink.cdc.connectors.mysql.source.offset.BinlogOffsetUtils; -import org.apache.flink.cdc.connectors.mysql.table.StartupOptions; -import java.sql.SQLException; import java.time.ZoneId; -import java.util.Arrays; import java.util.Map; -import java.util.Properties; -import java.util.stream.Collectors; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.type.TypeReference; @@ -42,118 +28,64 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.base.Preconditions; import com.mysql.cj.conf.ConnectionUrl; -import io.debezium.connector.mysql.MySqlConnection; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class ConfigUtil { - private static final ObjectMapper mapper = new ObjectMapper(); + private static ObjectMapper objectMapper = new ObjectMapper(); + private static final Logger LOG = LoggerFactory.getLogger(ConfigUtil.class); public static String getServerId(long jobId) { return String.valueOf(Math.abs(String.valueOf(jobId).hashCode())); } - public static MySqlSourceConfig generateMySqlConfig(JobBaseConfig config) { - return generateMySqlConfig(config.getConfig(), getServerId(config.getJobId())); - } - - public static MySqlSourceConfig generateMySqlConfig(Map config) 
{ - return generateMySqlConfig(config, "0"); + public static ZoneId getServerTimeZoneFromJdbcUrl(String jdbcUrl) { + if (jdbcUrl == null) { + return ZoneId.systemDefault(); + } + if (jdbcUrl.startsWith("jdbc:mysql://") || jdbcUrl.startsWith("jdbc:mariadb://")) { + return getServerTimeZone(jdbcUrl); + } else if (jdbcUrl.startsWith("jdbc:postgresql://")) { + return getPostgresServerTimeZone(jdbcUrl); + } + return ZoneId.systemDefault(); } - public static ZoneId getServerTimeZone(String jdbcUrl) { + private static ZoneId getServerTimeZone(String jdbcUrl) { Preconditions.checkNotNull(jdbcUrl, "jdbcUrl is null"); ConnectionUrl cu = ConnectionUrl.getConnectionUrlInstance(jdbcUrl, null); return getTimeZoneFromProps(cu.getOriginalProperties()); } - private static MySqlSourceConfig generateMySqlConfig( - Map cdcConfig, String serverId) { - MySqlSourceConfigFactory configFactory = new MySqlSourceConfigFactory(); - ConnectionUrl cu = - ConnectionUrl.getConnectionUrlInstance( - cdcConfig.get(DataSourceConfigKeys.JDBC_URL), null); - configFactory.hostname(cu.getMainHost().getHost()); - configFactory.port(cu.getMainHost().getPort()); - configFactory.username(cdcConfig.get(DataSourceConfigKeys.USER)); - configFactory.password(cdcConfig.get(DataSourceConfigKeys.PASSWORD)); - String databaseName = cdcConfig.get(DataSourceConfigKeys.DATABASE); - configFactory.databaseList(databaseName); - configFactory.serverId(serverId); - configFactory.serverTimeZone(getTimeZoneFromProps(cu.getOriginalProperties()).toString()); - - configFactory.includeSchemaChanges(false); - - String includingTables = cdcConfig.get(DataSourceConfigKeys.INCLUDE_TABLES); - String[] includingTbls = - Arrays.stream(includingTables.split(",")) - .map(t -> databaseName + "." + t.trim()) - .toArray(String[]::new); - configFactory.tableList(includingTbls); - - String excludingTables = cdcConfig.get(DataSourceConfigKeys.EXCLUDE_TABLES); - if (StringUtils.isNotEmpty(excludingTables)) { - String excludingTbls = - Arrays.stream(excludingTables.split(",")) - .map(t -> databaseName + "." 
+ t.trim()) - .collect(Collectors.joining(",")); - configFactory.excludeTableList(excludingTbls); - } - - // setting startMode - String startupMode = cdcConfig.get(DataSourceConfigKeys.OFFSET); - if (DataSourceConfigKeys.OFFSET_INITIAL.equalsIgnoreCase(startupMode)) { - // do not need set offset when initial - // configFactory.startupOptions(StartupOptions.initial()); - } else if (DataSourceConfigKeys.OFFSET_EARLIEST.equalsIgnoreCase(startupMode)) { - configFactory.startupOptions(StartupOptions.earliest()); - BinlogOffset binlogOffset = - initializeEffectiveOffset( - configFactory, StartupOptions.earliest().binlogOffset); - configFactory.startupOptions(StartupOptions.specificOffset(binlogOffset)); - } else if (DataSourceConfigKeys.OFFSET_LATEST.equalsIgnoreCase(startupMode)) { - configFactory.startupOptions(StartupOptions.latest()); - BinlogOffset binlogOffset = - initializeEffectiveOffset(configFactory, StartupOptions.latest().binlogOffset); - configFactory.startupOptions(StartupOptions.specificOffset(binlogOffset)); - } else if (isJson(startupMode)) { - // start from specific offset - Map offsetMap = toStringMap(startupMode); - if (MapUtils.isEmpty(offsetMap)) { - throw new RuntimeException("Incorrect offset " + startupMode); - } - if (offsetMap.containsKey(BinlogOffset.BINLOG_FILENAME_OFFSET_KEY) - && offsetMap.containsKey(BinlogOffset.BINLOG_POSITION_OFFSET_KEY)) { - BinlogOffset binlogOffset = new BinlogOffset(offsetMap); - configFactory.startupOptions(StartupOptions.specificOffset(binlogOffset)); - } else { - throw new RuntimeException("Incorrect offset " + startupMode); + public static ZoneId getTimeZoneFromProps(Map originalProperties) { + if (originalProperties != null && originalProperties.containsKey("serverTimezone")) { + String timeZone = originalProperties.get("serverTimezone"); + if (StringUtils.isNotEmpty(timeZone)) { + return ZoneId.of(timeZone); } - } else if (is13Timestamp(startupMode)) { - // start from timestamp - Long ts = Long.parseLong(startupMode); - BinlogOffset binlogOffset = - initializeEffectiveOffset( - configFactory, StartupOptions.timestamp(ts).binlogOffset); - configFactory.startupOptions(StartupOptions.specificOffset(binlogOffset)); - } else { - throw new RuntimeException("Unknown offset " + startupMode); } + return ZoneId.systemDefault(); + } - Properties jdbcProperteis = new Properties(); - jdbcProperteis.putAll(cu.getOriginalProperties()); - configFactory.jdbcProperties(jdbcProperteis); - - // configFactory.heartbeatInterval(Duration.ofMillis(1)); - if (cdcConfig.containsKey(DataSourceConfigKeys.SPLIT_SIZE)) { - configFactory.splitSize( - Integer.parseInt(cdcConfig.get(DataSourceConfigKeys.SPLIT_SIZE))); + public static ZoneId getPostgresServerTimeZone(String jdbcUrl) { + Preconditions.checkNotNull(jdbcUrl, "jdbcUrl is null"); + try { + java.util.Properties props = org.postgresql.Driver.parseURL(jdbcUrl, null); + if (props != null && props.containsKey("timezone")) { + String timeZone = props.getProperty("timezone"); + if (StringUtils.isNotEmpty(timeZone)) { + return ZoneId.of(timeZone); + } + } + } catch (Exception e) { + LOG.warn("Failed to parse Postgres JDBC URL for timezone: {}", jdbcUrl); } - - return configFactory.createConfig(0); + return ZoneId.systemDefault(); } - private static ZoneId getTimeZoneFromProps(Map originalProperties) { - if (originalProperties != null && originalProperties.containsKey("serverTimezone")) { - String timeZone = originalProperties.get("serverTimezone"); + public static ZoneId 
getPostgresServerTimeZoneFromProps(java.util.Properties props) { + if (props != null && props.containsKey("timezone")) { + String timeZone = props.getProperty("timezone"); if (StringUtils.isNotEmpty(timeZone)) { return ZoneId.of(timeZone); } @@ -161,39 +93,29 @@ private static ZoneId getTimeZoneFromProps(Map originalPropertie return ZoneId.systemDefault(); } - private static BinlogOffset initializeEffectiveOffset( - MySqlSourceConfigFactory configFactory, BinlogOffset binlogOffset) { - MySqlSourceConfig config = configFactory.createConfig(0); - try (MySqlConnection connection = DebeziumUtils.createMySqlConnection(config)) { - return BinlogOffsetUtils.initializeEffectiveOffset(binlogOffset, connection, config); - } catch (SQLException e) { - throw new RuntimeException(e); - } - } - - private static boolean is13Timestamp(String s) { + public static boolean is13Timestamp(String s) { return s != null && s.matches("\\d{13}"); } - private static boolean isJson(String str) { + public static boolean isJson(String str) { if (str == null || str.trim().isEmpty()) { return false; } try { - JsonNode node = mapper.readTree(str); + JsonNode node = objectMapper.readTree(str); return node.isObject(); } catch (Exception e) { return false; } } - private static Map toStringMap(String json) { + public static Map toStringMap(String json) { if (!isJson(json)) { return null; } try { - return mapper.readValue(json, new TypeReference>() {}); + return objectMapper.readValue(json, new TypeReference>() {}); } catch (JsonProcessingException e) { return null; } diff --git a/fs_brokers/cdc_client/src/main/java/org/apache/flink/cdc/connectors/postgres/source/PostgresConnectionPoolFactory.java b/fs_brokers/cdc_client/src/main/java/org/apache/flink/cdc/connectors/postgres/source/PostgresConnectionPoolFactory.java new file mode 100644 index 00000000000000..503473a68ba372 --- /dev/null +++ b/fs_brokers/cdc_client/src/main/java/org/apache/flink/cdc/connectors/postgres/source/PostgresConnectionPoolFactory.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.postgres.source; + +import org.apache.flink.cdc.connectors.base.config.JdbcSourceConfig; +import org.apache.flink.cdc.connectors.base.relational.connection.ConnectionPoolId; +import org.apache.flink.cdc.connectors.base.relational.connection.JdbcConnectionPoolFactory; + +import com.zaxxer.hikari.HikariDataSource; +import io.debezium.jdbc.JdbcConfiguration; + +/** + * A connection pool factory to create pooled Postgres {@link HikariDataSource}. 
Copy from + * https://github.com/apache/flink-cdc/blob/release-3.5/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-postgres-cdc/src/main/java/org/apache/flink/cdc/connectors/postgres/source/PostgresConnectionPoolFactory.java + * Line 54: to use config.getUser() instead of config.getHostname() as username to fix FLINK-38847 + */ +public class PostgresConnectionPoolFactory extends JdbcConnectionPoolFactory { + public static final String JDBC_URL_PATTERN = "jdbc:postgresql://%s:%s/%s"; + + @Override + public String getJdbcUrl(JdbcSourceConfig sourceConfig) { + + String hostName = sourceConfig.getHostname(); + int port = sourceConfig.getPort(); + String database = sourceConfig.getDatabaseList().get(0); + return String.format(JDBC_URL_PATTERN, hostName, port, database); + } + + /** + * The reuses of connection pools are based on databases in postgresql. Different databases in + * same instance cannot reuse same connection pool to connect. + */ + @Override + public ConnectionPoolId getPoolId( + JdbcConfiguration config, String dataSourcePoolFactoryIdentifier) { + return new ConnectionPoolId( + config.getHostname(), + config.getPort(), + config.getUser(), + config.getDatabase(), + dataSourcePoolFactoryIdentifier); + } +} diff --git a/regression-test/data/external_table_p0/jdbc/test_pg_jdbc_catalog.out b/regression-test/data/external_table_p0/jdbc/test_pg_jdbc_catalog.out index f92fb5d694a67b..61aa3523d36681 100644 --- a/regression-test/data/external_table_p0/jdbc/test_pg_jdbc_catalog.out +++ b/regression-test/data/external_table_p0/jdbc/test_pg_jdbc_catalog.out @@ -1,6 +1,7 @@ -- This file is automatically generated. You should know what you did if you want to edit this -- !show_db -- catalog_pg_test +cdc_test doris_test information_schema mysql @@ -2296,6 +2297,7 @@ mysql -- !specified_database_3 -- catalog_pg_test +cdc_test information_schema mysql pg_catalog diff --git a/regression-test/data/job_p0/streaming_job/cdc/test_streaming_mysql_job_exclude.out b/regression-test/data/job_p0/streaming_job/cdc/test_streaming_mysql_job_exclude.out index 4a0130d4b60dad..ce258ec474511f 100644 --- a/regression-test/data/job_p0/streaming_job/cdc/test_streaming_mysql_job_exclude.out +++ b/regression-test/data/job_p0/streaming_job/cdc/test_streaming_mysql_job_exclude.out @@ -1,5 +1,17 @@ -- This file is automatically generated. You should know what you did if you want to edit this --- !select -- +-- !select_table1 -- A1 1 B1 2 +-- !select_table2 -- +A2 1 +B2 2 + +-- !select_table1 -- +A1 1 +B1 2 + +-- !select_table2 -- +A2 1 +B2 2 + diff --git a/regression-test/data/job_p0/streaming_job/cdc/test_streaming_postgres_job.out b/regression-test/data/job_p0/streaming_job/cdc/test_streaming_postgres_job.out new file mode 100644 index 00000000000000..aebbb6815e3952 --- /dev/null +++ b/regression-test/data/job_p0/streaming_job/cdc/test_streaming_postgres_job.out @@ -0,0 +1,18 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_snapshot_table1 -- +A1 1 +B1 2 + +-- !select_snapshot_table2 -- +A2 1 +B2 2 + +-- !select_binlog_table1 -- +B1 10 +Doris 18 + +-- !select_next_binlog_table1 -- +Apache 40 +B1 10 +Doris 18 + diff --git a/regression-test/data/job_p0/streaming_job/cdc/test_streaming_postgres_job_all_type.out b/regression-test/data/job_p0/streaming_job/cdc/test_streaming_postgres_job_all_type.out new file mode 100644 index 00000000000000..1efdc563a1474b --- /dev/null +++ b/regression-test/data/job_p0/streaming_job/cdc/test_streaming_postgres_job_all_type.out @@ -0,0 +1,39 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !desc_all_types_null -- +id bigint No true \N +smallint_col smallint Yes false \N NONE +integer_col int Yes false \N NONE +bigint_col bigint Yes false \N NONE +real_col float Yes false \N NONE +double_col double Yes false \N NONE +numeric_col decimal(20,6) Yes false \N NONE +char_col char(10) Yes false \N NONE +varchar_col text Yes false \N NONE +text_col text Yes false \N NONE +boolean_col boolean Yes false \N NONE +date_col date Yes false \N NONE +time_col text Yes false \N NONE +timetz_col text Yes false \N NONE +timestamp_col datetime(6) Yes false \N NONE +timestamptz_col datetime(6) Yes false \N NONE +interval_col text Yes false \N NONE +bytea_col text Yes false \N NONE +uuid_col text Yes false \N NONE +json_col text Yes false \N NONE +jsonb_col text Yes false \N NONE +inet_col text Yes false \N NONE +cidr_col text Yes false \N NONE +macaddr_col text Yes false \N NONE +bit_col text Yes false \N NONE +bit_varying_col text Yes false \N NONE +int_array_col array Yes false \N NONE +text_array_col array Yes false \N NONE +point_col text Yes false \N NONE + +-- !select_all_types_null -- +1 1 100 1000 1.23 4.56 12345.678901 char varchar text value true 2024-01-01 12:00 12:00:00Z 2024-01-01T12:00 2024-01-01T04:00 P0Y0M1DT0H0M0S 3q2+7w== 11111111-2222-3333-4444-555555555555 {"a":1} {"b": 2} 192.168.1.1 192.168.0.0/24 08:00:2b:01:02:03 qg== Cg== [1, 2, 3] ["a", "b", "c"] {"coordinates":[1,2],"type":"Point","srid":0} + +-- !select_all_types_null2 -- +1 1 100 1000 1.23 4.56 12345.678901 char varchar text value true 2024-01-01 12:00 12:00:00Z 2024-01-01T12:00 2024-01-01T04:00 P0Y0M1DT0H0M0S 3q2+7w== 11111111-2222-3333-4444-555555555555 {"a":1} {"b": 2} 192.168.1.1 192.168.0.0/24 08:00:2b:01:02:03 qg== Cg== [1, 2, 3] ["a", "b", "c"] {"coordinates":[1,2],"type":"Point","srid":0} +2 2 200 2000 7.89 0.12 99999.000001 char2 varchar2 another text false 2025-01-01 23:59:59 23:59:59Z 2025-01-01T23:59:59 2025-01-01T23:59:59 P0Y0M0DT2H0M0S 3q2+7w== 11111111-2222-3333-4444-555555555556 {"x":10} {"y": 20} 10.0.0.1 10.0.0.0/16 08:00:2b:aa:bb:cc 8A== Dw== [10, 20] ["x", "y"] {"coordinates":[3,4],"type":"Point","srid":0} + diff --git a/regression-test/data/job_p0/streaming_job/cdc/test_streaming_postgres_job_priv.out b/regression-test/data/job_p0/streaming_job/cdc/test_streaming_postgres_job_priv.out new file mode 100644 index 00000000000000..bd7e1c50d6c1f0 --- /dev/null +++ b/regression-test/data/job_p0/streaming_job/cdc/test_streaming_postgres_job_priv.out @@ -0,0 +1,4 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select -- +Doris 18 + diff --git a/regression-test/framework/pom.xml b/regression-test/framework/pom.xml index a080d2a130452f..a3db3f4b09b41a 100644 --- a/regression-test/framework/pom.xml +++ b/regression-test/framework/pom.xml @@ -216,6 +216,11 @@ under the License. mysql-connector-java 8.0.28 + + org.postgresql + postgresql + 42.7.3 + commons-cli commons-cli diff --git a/regression-test/suites/job_p0/streaming_job/cdc/test_streaming_mysql_job_exclude.groovy b/regression-test/suites/job_p0/streaming_job/cdc/test_streaming_mysql_job_exclude.groovy index bd6240269d19a0..27553e0c0a525e 100644 --- a/regression-test/suites/job_p0/streaming_job/cdc/test_streaming_mysql_job_exclude.groovy +++ b/regression-test/suites/job_p0/streaming_job/cdc/test_streaming_mysql_job_exclude.groovy @@ -25,7 +25,7 @@ suite("test_streaming_mysql_job_exclude", "p0,external,mysql,external_docker,ext def currentDb = (sql "select database()")[0][0] def table1 = "user_info_exclude1" def table2 = "user_info_exclude2" - def mysqlDb = "test_cdc_db" + def mysqlDb = "test_cdc_exclude_db" sql """DROP JOB IF EXISTS where jobname = '${jobName}'""" sql """drop table if exists ${currentDb}.${table1} force""" @@ -61,7 +61,7 @@ suite("test_streaming_mysql_job_exclude", "p0,external,mysql,external_docker,ext sql """INSERT INTO ${mysqlDb}.${table2} (name, age) VALUES ('B2', 2);""" } - // exclude_table + //case1: When both include_tables and exclude_table are specified, use include_tables. sql """CREATE JOB ${jobName} ON STREAMING FROM MYSQL ( @@ -105,12 +105,108 @@ suite("test_streaming_mysql_job_exclude", "p0,external,mysql,external_docker,ext } // check snapshot data - qt_select """ SELECT * FROM ${table1} order by name asc """ + qt_select_table1 """ SELECT * FROM ${table1} order by name asc """ + + + //case2: Specify exclude_table, but do not specify include_tables + sql """DROP JOB IF EXISTS where jobname = '${jobName}'""" + sql """drop table if exists ${currentDb}.${table1} force""" + sql """drop table if exists ${currentDb}.${table2} force""" + sql """CREATE JOB ${jobName} + ON STREAMING + FROM MYSQL ( + "jdbc_url" = "jdbc:mysql://${externalEnvIp}:${mysql_port}", + "driver_url" = "${driver_url}", + "driver_class" = "com.mysql.cj.jdbc.Driver", + "user" = "root", + "password" = "123456", + "database" = "${mysqlDb}", + "exclude_tables" = "${table1}", + "offset" = "initial" + ) + TO DATABASE ${currentDb} ( + "table.create.properties.replication_num" = "1" + ) + """ + // check table created + def showTablesCase2 = sql """ show tables from ${currentDb} like '${table1}'; """ + assert showTablesCase2.size() == 0 + def showTables2Case2 = sql """ show tables from ${currentDb} like '${table2}'; """ + assert showTables2Case2.size() == 1 + + // check job running + try { + Awaitility.await().atMost(300, SECONDS) + .pollInterval(1, SECONDS).until( + { + def jobSuccendCount = sql """ select SucceedTaskCount from jobs("type"="insert") where Name = '${jobName}' and ExecuteType='STREAMING' """ + log.info("jobSuccendCount: " + jobSuccendCount) + // check job status and succeed task count larger than 1 + jobSuccendCount.size() == 1 && '1' <= jobSuccendCount.get(0).get(0) + } + ) + } catch (Exception ex){ + def showjob = sql """select * from jobs("type"="insert") where Name='${jobName}'""" + def showtask = sql """select * from tasks("type"="insert") where JobName='${jobName}'""" + log.info("show job: " + showjob) + log.info("show task: " + showtask) + throw ex; + } + + // check snapshot 
data + qt_select_table2 """ SELECT * FROM ${table2} order by name asc """ + + //case3: Do not specify either exclude_table or include_tables + sql """DROP JOB IF EXISTS where jobname = '${jobName}'""" + sql """drop table if exists ${currentDb}.${table1} force""" + sql """drop table if exists ${currentDb}.${table2} force""" + sql """CREATE JOB ${jobName} + ON STREAMING + FROM MYSQL ( + "jdbc_url" = "jdbc:mysql://${externalEnvIp}:${mysql_port}", + "driver_url" = "${driver_url}", + "driver_class" = "com.mysql.cj.jdbc.Driver", + "user" = "root", + "password" = "123456", + "database" = "${mysqlDb}", + "offset" = "initial" + ) + TO DATABASE ${currentDb} ( + "table.create.properties.replication_num" = "1" + ) + """ + // check table created + def showTablesCase3 = sql """ show tables from ${currentDb} like '${table1}'; """ + assert showTablesCase3.size() == 1 + def showTables2Case3 = sql """ show tables from ${currentDb} like '${table2}'; """ + assert showTables2Case3.size() == 1 + + // check job running + try { + Awaitility.await().atMost(300, SECONDS) + .pollInterval(1, SECONDS).until( + { + def jobSuccendCount = sql """ select SucceedTaskCount from jobs("type"="insert") where Name = '${jobName}' and ExecuteType='STREAMING' """ + log.info("jobSuccendCount: " + jobSuccendCount) + // check job status and succeed task count larger than 1 + jobSuccendCount.size() == 1 && '2' <= jobSuccendCount.get(0).get(0) + } + ) + } catch (Exception ex){ + def showjob = sql """select * from jobs("type"="insert") where Name='${jobName}'""" + def showtask = sql """select * from tasks("type"="insert") where JobName='${jobName}'""" + log.info("show job: " + showjob) + log.info("show task: " + showtask) + throw ex; + } + + // check snapshot data + qt_select_table1 """ SELECT * FROM ${table1} order by name asc """ + qt_select_table2 """ SELECT * FROM ${table2} order by name asc """ sql """ DROP JOB IF EXISTS where jobname = '${jobName}' """ - def jobCountRsp = sql """select count(1) from jobs("type"="insert") where Name ='${jobName}'""" assert jobCountRsp.get(0).get(0) == 0 } diff --git a/regression-test/suites/job_p0/streaming_job/cdc/test_streaming_postgres_job.groovy b/regression-test/suites/job_p0/streaming_job/cdc/test_streaming_postgres_job.groovy new file mode 100644 index 00000000000000..2a82b3a5777d09 --- /dev/null +++ b/regression-test/suites/job_p0/streaming_job/cdc/test_streaming_postgres_job.groovy @@ -0,0 +1,171 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
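The three cases above pin down how include_tables and exclude_tables interact: when both are set, include_tables wins; when only exclude_tables is set, every table in the database except the excluded ones is synced; when neither is set, all tables are synced. A minimal Java sketch of that resolution rule; resolveTables is a hypothetical helper written only for illustration, not the FE implementation:

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class TableFilterSketch {
    // Hypothetical helper mirroring the behaviour exercised by case1/case2/case3 above.
    static List<String> resolveTables(List<String> allTables, String includeTables, String excludeTables) {
        if (includeTables != null && !includeTables.isEmpty()) {
            // case1: include_tables takes precedence over exclude_tables
            return Arrays.stream(includeTables.split(",")).map(String::trim).collect(Collectors.toList());
        }
        if (excludeTables != null && !excludeTables.isEmpty()) {
            // case2: everything except the excluded tables
            List<String> excluded =
                    Arrays.stream(excludeTables.split(",")).map(String::trim).collect(Collectors.toList());
            return allTables.stream().filter(t -> !excluded.contains(t)).collect(Collectors.toList());
        }
        // case3: neither property set, capture all tables
        return allTables;
    }

    public static void main(String[] args) {
        List<String> all = Arrays.asList("user_info_exclude1", "user_info_exclude2");
        System.out.println(resolveTables(all, "user_info_exclude1", "user_info_exclude2")); // [user_info_exclude1]
        System.out.println(resolveTables(all, null, "user_info_exclude1"));                 // [user_info_exclude2]
        System.out.println(resolveTables(all, null, null));                                 // both tables
    }
}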
+ + +import org.awaitility.Awaitility + +import static java.util.concurrent.TimeUnit.SECONDS + +suite("test_streaming_postgres_job", "p0,external,pg,external_docker,external_docker_pg") { + def jobName = "test_streaming_postgres_job_name" + def currentDb = (sql "select database()")[0][0] + def table1 = "user_info_pg_normal1" + def table2 = "user_info_pg_normal2" + def pgDB = "postgres" + def pgSchema = "cdc_test" + def pgUser = "postgres" + def pgPassword = "123456" + + sql """DROP JOB IF EXISTS where jobname = '${jobName}'""" + sql """drop table if exists ${currentDb}.${table1} force""" + sql """drop table if exists ${currentDb}.${table2} force""" + + // Pre-create table2 + sql """ + CREATE TABLE IF NOT EXISTS ${currentDb}.${table2} ( + `name` varchar(200) NULL, + `age` int NULL + ) ENGINE=OLAP + UNIQUE KEY(`name`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`name`) BUCKETS AUTO + PROPERTIES ("replication_allocation" = "tag.location.default: 1"); + """ + + String enabled = context.config.otherConfigs.get("enableJdbcTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + String pg_port = context.config.otherConfigs.get("pg_14_port"); + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + String s3_endpoint = getS3Endpoint() + String bucket = getS3BucketName() + String driver_url = "https://${bucket}.${s3_endpoint}/regression/jdbc_driver/postgresql-42.5.0.jar" + + // create test + connect("${pgUser}", "${pgPassword}", "jdbc:postgresql://${externalEnvIp}:${pg_port}/${pgDB}") { + // sql """CREATE SCHEMA IF NOT EXISTS ${pgSchema}""" + sql """DROP TABLE IF EXISTS ${pgDB}.${pgSchema}.${table1}""" + sql """DROP TABLE IF EXISTS ${pgDB}.${pgSchema}.${table2}""" + sql """CREATE TABLE ${pgDB}.${pgSchema}.${table1} ( + "name" varchar(200), + "age" int2, + PRIMARY KEY ("name") + )""" + sql """INSERT INTO ${pgDB}.${pgSchema}.${table1} (name, age) VALUES ('A1', 1);""" + sql """INSERT INTO ${pgDB}.${pgSchema}.${table1} (name, age) VALUES ('B1', 2);""" + sql """CREATE TABLE ${pgDB}.${pgSchema}.${table2} ( + "name" varchar(200), + "age" int2, + PRIMARY KEY ("name") + )""" + // mock snapshot data + sql """INSERT INTO ${pgDB}.${pgSchema}.${table2} (name, age) VALUES ('A2', 1);""" + sql """INSERT INTO ${pgDB}.${pgSchema}.${table2} (name, age) VALUES ('B2', 2);""" + } + + sql """CREATE JOB ${jobName} + ON STREAMING + FROM POSTGRES ( + "jdbc_url" = "jdbc:postgresql://${externalEnvIp}:${pg_port}/${pgDB}", + "driver_url" = "${driver_url}", + "driver_class" = "org.postgresql.Driver", + "user" = "${pgUser}", + "password" = "${pgPassword}", + "database" = "${pgDB}", + "schema" = "${pgSchema}", + "include_tables" = "${table1},${table2}", + "offset" = "initial" + ) + TO DATABASE ${currentDb} ( + "table.create.properties.replication_num" = "1" + ) + """ + def showAllTables = sql """ show tables from ${currentDb}""" + log.info("showAllTables: " + showAllTables) + // check table created + def showTables = sql """ show tables from ${currentDb} like '${table1}'; """ + assert showTables.size() == 1 + def showTables2 = sql """ show tables from ${currentDb} like '${table2}'; """ + assert showTables2.size() == 1 + + // check table schema correct + def showTbl1 = sql """show create table ${currentDb}.${table1}""" + def createTalInfo = showTbl1[0][1]; + assert createTalInfo.contains("`name` varchar(65533)"); + assert createTalInfo.contains("`age` smallint"); + assert createTalInfo.contains("UNIQUE KEY(`name`)"); + assert createTalInfo.contains("DISTRIBUTED BY HASH(`name`) BUCKETS AUTO"); + + // check 
job running + try { + Awaitility.await().atMost(300, SECONDS) + .pollInterval(1, SECONDS).until( + { + def jobSuccendCount = sql """ select SucceedTaskCount from jobs("type"="insert") where Name = '${jobName}' and ExecuteType='STREAMING' """ + log.info("jobSuccendCount: " + jobSuccendCount) + // check job status and succeed task count larger than 2 + jobSuccendCount.size() == 1 && '2' <= jobSuccendCount.get(0).get(0) + } + ) + } catch (Exception ex){ + def showjob = sql """select * from jobs("type"="insert") where Name='${jobName}'""" + def showtask = sql """select * from tasks("type"="insert") where JobName='${jobName}'""" + log.info("show job: " + showjob) + log.info("show task: " + showtask) + throw ex; + } + + // check snapshot data + qt_select_snapshot_table1 """ SELECT * FROM ${table1} order by name asc """ + qt_select_snapshot_table2 """ SELECT * FROM ${table2} order by name asc """ + + // mock incremental into + connect("${pgUser}", "${pgPassword}", "jdbc:postgresql://${externalEnvIp}:${pg_port}/${pgDB}") { + sql """INSERT INTO ${pgDB}.${pgSchema}.${table1} (name,age) VALUES ('Doris',18);""" + sql """UPDATE ${pgDB}.${pgSchema}.${table1} SET age = 10 WHERE name = 'B1';""" + sql """DELETE FROM ${pgDB}.${pgSchema}.${table1} WHERE name = 'A1';""" + } + + sleep(30000); // wait for cdc incremental data + + // check incremental data + qt_select_binlog_table1 """ SELECT * FROM ${table1} order by name asc """ + + def jobInfo = sql """ + select loadStatistic, status from jobs("type"="insert") where Name='${jobName}' + """ + log.info("jobInfo: " + jobInfo) + assert jobInfo.get(0).get(0) == "{\"scannedRows\":7,\"loadBytes\":337,\"fileNumber\":0,\"fileSize\":0}" + assert jobInfo.get(0).get(1) == "RUNNING" + + // mock incremental into again + connect("${pgUser}", "${pgPassword}", "jdbc:postgresql://${externalEnvIp}:${pg_port}/${pgDB}") { + sql """INSERT INTO ${pgDB}.${pgSchema}.${table1} (name,age) VALUES ('Apache',40);""" + } + + sleep(30000); // wait for cdc incremental data + + // check incremental data + qt_select_next_binlog_table1 """ SELECT * FROM ${table1} order by name asc """ + + sql """ + DROP JOB IF EXISTS where jobname = '${jobName}' + """ + + def jobCountRsp = sql """select count(1) from jobs("type"="insert") where Name ='${jobName}'""" + assert jobCountRsp.get(0).get(0) == 0 + } +} diff --git a/regression-test/suites/job_p0/streaming_job/cdc/test_streaming_postgres_job_all_type.groovy b/regression-test/suites/job_p0/streaming_job/cdc/test_streaming_postgres_job_all_type.groovy new file mode 100644 index 00000000000000..dcd688bb94fc85 --- /dev/null +++ b/regression-test/suites/job_p0/streaming_job/cdc/test_streaming_postgres_job_all_type.groovy @@ -0,0 +1,143 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
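The all-types suite that follows, together with its expected desc output earlier in this patch, documents the PostgreSQL-to-Doris type mapping: integer types map one to one, numeric keeps its precision and scale, timestamp and timestamptz become datetime(6), arrays stay arrays, and PostgreSQL-specific types such as uuid, json, inet, interval, bit and point fall back to text. A hedged Java sketch of that mapping; mapPgType is an illustrative helper derived from the expected output, not the converter actually used by the CDC client:

public class PgToDorisTypeSketch {
    // Illustrative mapping taken from the expected `desc` result of the all-types table.
    static String mapPgType(String pgType) {
        switch (pgType) {
            case "smallint": return "smallint";
            case "integer": return "int";
            case "bigint":
            case "bigserial": return "bigint";
            case "real": return "float";
            case "double precision": return "double";
            case "numeric(20,6)": return "decimal(20,6)";
            case "boolean": return "boolean";
            case "date": return "date";
            case "timestamp":
            case "timestamp with time zone": return "datetime(6)";
            case "char(10)": return "char(10)";
            case "integer[]":
            case "text[]": return "array";
            default: return "text"; // varchar, time, uuid, json, inet, interval, bit, point, ...
        }
    }

    public static void main(String[] args) {
        System.out.println(mapPgType("timestamp with time zone")); // datetime(6)
        System.out.println(mapPgType("uuid"));                     // text
    }
}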
+ + +import org.awaitility.Awaitility + +import static java.util.concurrent.TimeUnit.SECONDS + +suite("test_streaming_postgres_job_all_type", "p0,external,pg,external_docker,external_docker_pg") { + def jobName = "test_streaming_postgres_job_all_type_name" + def currentDb = (sql "select database()")[0][0] + def table1 = "streaming_all_types_nullable_with_pk_pg" + def pgDB = "postgres" + def pgSchema = "cdc_test" + def pgUser = "postgres" + def pgPassword = "123456" + + sql """DROP JOB IF EXISTS where jobname = '${jobName}'""" + sql """drop table if exists ${currentDb}.${table1} force""" + + String enabled = context.config.otherConfigs.get("enableJdbcTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + String pg_port = context.config.otherConfigs.get("pg_14_port"); + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + String s3_endpoint = getS3Endpoint() + String bucket = getS3BucketName() + String driver_url = "https://${bucket}.${s3_endpoint}/regression/jdbc_driver/postgresql-42.5.0.jar" + + // create test + connect("${pgUser}", "${pgPassword}", "jdbc:postgresql://${externalEnvIp}:${pg_port}/${pgDB}") { + // sql """CREATE SCHEMA IF NOT EXISTS ${pgSchema}""" + sql """DROP TABLE IF EXISTS ${pgDB}.${pgSchema}.${table1}""" + sql """ + create table ${pgDB}.${pgSchema}.${table1} ( + id bigserial PRIMARY KEY, + smallint_col smallint, + integer_col integer, + bigint_col bigint, + real_col real, + double_col double precision, + numeric_col numeric(20,6), + char_col char(10), + varchar_col varchar(255), + text_col text, + boolean_col boolean, + date_col date, + time_col time, + timetz_col time with time zone, + timestamp_col timestamp, + timestamptz_col timestamp with time zone, + interval_col interval, + bytea_col bytea, + uuid_col uuid, + json_col json, + jsonb_col jsonb, + inet_col inet, + cidr_col cidr, + macaddr_col macaddr, + bit_col bit(8), + bit_varying_col bit varying(16), + int_array_col integer[], + text_array_col text[], + point_col point + ); + """ + // mock snapshot data + sql """ + INSERT INTO ${pgDB}.${pgSchema}.${table1} VALUES (1,1,100,1000,1.23,4.56,12345.678901,'char','varchar','text value',true,'2024-01-01','12:00:00','12:00:00+08','2024-01-01 12:00:00','2024-01-01 12:00:00+08','1 day',decode('DEADBEEF', 'hex'),'11111111-2222-3333-4444-555555555555'::uuid,'{"a":1}','{"b":2}','192.168.1.1','192.168.0.0/24','08:00:2b:01:02:03',B'10101010',B'1010',ARRAY[1,2,3],ARRAY['a','b','c'],'(1,2)'); + """ + } + + sql """CREATE JOB ${jobName} + ON STREAMING + FROM POSTGRES ( + "jdbc_url" = "jdbc:postgresql://${externalEnvIp}:${pg_port}/${pgDB}?timezone=UTC", + "driver_url" = "${driver_url}", + "driver_class" = "org.postgresql.Driver", + "user" = "${pgUser}", + "password" = "${pgPassword}", + "database" = "${pgDB}", + "schema" = "${pgSchema}", + "include_tables" = "${table1}", + "offset" = "initial" + ) + TO DATABASE ${currentDb} ( + "table.create.properties.replication_num" = "1" + ) + """ + + // check job running + try { + Awaitility.await().atMost(300, SECONDS) + .pollInterval(1, SECONDS).until( + { + def jobSuccendCount = sql """ select SucceedTaskCount from jobs("type"="insert") where Name = '${jobName}' and ExecuteType='STREAMING' """ + log.info("jobSuccendCount: " + jobSuccendCount) + // check job status and succeed task count larger than 1 + jobSuccendCount.size() == 1 && '1' <= jobSuccendCount.get(0).get(0) + } + ) + } catch (Exception ex){ + def showjob = sql """select * from jobs("type"="insert") where Name='${jobName}'""" + def showtask = sql 
"""select * from tasks("type"="insert") where JobName='${jobName}'""" + log.info("show job: " + showjob) + log.info("show task: " + showtask) + throw ex; + } + + qt_desc_all_types_null """desc ${currentDb}.${table1};""" + qt_select_all_types_null """select * from ${currentDb}.${table1} order by 1;""" + + // mock incremental into + connect("${pgUser}", "${pgPassword}", "jdbc:postgresql://${externalEnvIp}:${pg_port}/${pgDB}") { + sql """INSERT INTO ${pgDB}.${pgSchema}.${table1} VALUES (2,2,200,2000,7.89,0.12,99999.000001,'char2','varchar2','another text',false,'2025-01-01','23:59:59','23:59:59+00','2025-01-01 23:59:59','2025-01-01 23:59:59+00','2 hours',decode('DEADBEEF', 'hex'),'11111111-2222-3333-4444-555555555556'::uuid,'{"x":10}','{"y":20}','10.0.0.1','10.0.0.0/16','08:00:2b:aa:bb:cc',B'11110000',B'1111',ARRAY[10,20],ARRAY['x','y'],'(3,4)');""" + } + + sleep(30000); // wait for cdc incremental data + + // check incremental data + qt_select_all_types_null2 """select * from ${currentDb}.${table1} order by 1;""" + + sql """ + DROP JOB IF EXISTS where jobname = '${jobName}' + """ + + def jobCountRsp = sql """select count(1) from jobs("type"="insert") where Name ='${jobName}'""" + assert jobCountRsp.get(0).get(0) == 0 + } +} diff --git a/regression-test/suites/job_p0/streaming_job/cdc/test_streaming_postgres_job_dup.groovy b/regression-test/suites/job_p0/streaming_job/cdc/test_streaming_postgres_job_dup.groovy new file mode 100644 index 00000000000000..cae745d06e6835 --- /dev/null +++ b/regression-test/suites/job_p0/streaming_job/cdc/test_streaming_postgres_job_dup.groovy @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_streaming_postgres_job_dup", "p0,external,pg,external_docker,external_docker_pg") { + def jobName = "test_streaming_postgres_job_dup_name" + def currentDb = (sql "select database()")[0][0] + def table1 = "test_streaming_postgres_job_dup" + def pgDB = "postgres" + def pgSchema = "cdc_test" + def pgUser = "postgres" + def pgPassword = "123456" + + sql """DROP JOB IF EXISTS where jobname = '${jobName}'""" + sql """drop table if exists ${currentDb}.${table1} force""" + + String enabled = context.config.otherConfigs.get("enableJdbcTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + String pg_port = context.config.otherConfigs.get("pg_14_port") + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + String s3_endpoint = getS3Endpoint() + String bucket = getS3BucketName() + String driver_url = "https://${bucket}.${s3_endpoint}/regression/jdbc_driver/postgresql-42.5.0.jar" + + // create test + connect("${pgUser}", "${pgPassword}", "jdbc:postgresql://${externalEnvIp}:${pg_port}/${pgDB}") { + // sql """CREATE SCHEMA IF NOT EXISTS ${pgSchema}""" + sql """DROP TABLE IF EXISTS ${pgDB}.${pgSchema}.${table1}""" + sql """CREATE TABLE ${pgDB}.${pgSchema}.${table1} ( + "name" varchar(200), + "age" int2 + )""" + sql """INSERT INTO ${pgDB}.${pgSchema}.${table1} (name, age) VALUES ('A1', 1);""" + } + + test { + sql """CREATE JOB ${jobName} + ON STREAMING + FROM POSTGRES ( + "jdbc_url" = "jdbc:postgresql://${externalEnvIp}:${pg_port}/${pgDB}", + "driver_url" = "${driver_url}", + "driver_class" = "org.postgresql.Driver", + "user" = "${pgUser}", + "password" = "${pgPassword}", + "database" = "${pgDB}", + "schema" = "${pgSchema}", + "include_tables" = "${table1}", + "offset" = "initial" + ) + TO DATABASE ${currentDb} ( + "table.create.properties.replication_num" = "1" + ) + """ + exception "The following tables do not have primary key defined: ${table1}" + } + + def jobInfo = sql """ + select * from jobs("type"="insert") where Name='${jobName}' + """ + assert jobInfo.size()== 0 + + sql """ + DROP JOB IF EXISTS where jobname = '${jobName}' + """ + + def jobCountRsp = sql """select count(1) from jobs("type"="insert") where Name ='${jobName}'""" + assert jobCountRsp.get(0).get(0) == 0 + } +} diff --git a/regression-test/suites/job_p0/streaming_job/cdc/test_streaming_postgres_job_priv.groovy b/regression-test/suites/job_p0/streaming_job/cdc/test_streaming_postgres_job_priv.groovy new file mode 100644 index 00000000000000..6b70301e43dcd9 --- /dev/null +++ b/regression-test/suites/job_p0/streaming_job/cdc/test_streaming_postgres_job_priv.groovy @@ -0,0 +1,150 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import org.awaitility.Awaitility + +import static java.util.concurrent.TimeUnit.SECONDS + +suite("test_streaming_postgres_job_priv", "p0,external,pg,external_docker,external_docker_pg") { + def tableName = "test_streaming_postgres_job_priv_tbl" + def jobName = "test_streaming_postgres_job_priv_name" + def currentDb = (sql "select database()")[0][0] + def pgDB = "postgres" + def pgSchema = "cdc_test" + def pgUser = "postgres" + def pgPassword = "123456" + + sql """DROP JOB IF EXISTS where jobname = '${jobName}'""" + sql """drop table if exists ${currentDb}.${tableName} force""" + + String enabled = context.config.otherConfigs.get("enableJdbcTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + String pg_port = context.config.otherConfigs.get("pg_14_port") + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + String s3_endpoint = getS3Endpoint() + String bucket = getS3BucketName() + String driver_url = "https://${bucket}.${s3_endpoint}/regression/jdbc_driver/postgresql-42.5.0.jar" + + // create pg test table + connect("${pgUser}", "${pgPassword}", "jdbc:postgresql://${externalEnvIp}:${pg_port}/${pgDB}") { + // sql """CREATE SCHEMA IF NOT EXISTS ${pgSchema}""" + sql """DROP TABLE IF EXISTS ${pgDB}.${pgSchema}.${tableName}""" + sql """CREATE TABLE ${pgDB}.${pgSchema}.${tableName} ( + "name" varchar(200), + "age" int2, + PRIMARY KEY ("name") + )""" + // mock snapshot data + sql """INSERT INTO ${pgDB}.${pgSchema}.${tableName} (name, age) VALUES ('A1', 1);""" + sql """INSERT INTO ${pgDB}.${pgSchema}.${tableName} (name, age) VALUES ('B1', 2);""" + } + + // create a new pg user only has select priv + def newPgUser = "test_job_priv_pg" + def newPgPassword = "test123" + connect("${pgUser}", "${pgPassword}", "jdbc:postgresql://${externalEnvIp}:${pg_port}/${pgDB}") { + try { + sql """DROP OWNED BY ${newPgUser}""" + }catch (Exception e) { + log.info("Drop owned failed, maybe user not exist yet.") + } + sql """DROP ROLE IF EXISTS ${newPgUser}""" + sql """CREATE ROLE ${newPgUser} WITH LOGIN PASSWORD '${newPgPassword}'""" + sql """GRANT USAGE ON SCHEMA ${pgSchema} TO ${newPgUser}""" + sql """GRANT SELECT, INSERT ON ALL TABLES IN SCHEMA ${pgSchema} TO ${newPgUser}""" + } + + // create job by new user + sql """CREATE JOB ${jobName} + ON STREAMING + FROM POSTGRES ( + "jdbc_url" = "jdbc:postgresql://${externalEnvIp}:${pg_port}/${pgDB}", + "driver_url" = "${driver_url}", + "driver_class" = "org.postgresql.Driver", + "user" = "${newPgUser}", + "password" = "${newPgPassword}", + "database" = "${pgDB}", + "schema" = "${pgSchema}", + "include_tables" = "${tableName}", + "offset" = "latest" + ) + TO DATABASE ${currentDb} ( + "table.create.properties.replication_num" = "1" + ) + """ + + // check job running + try { + Awaitility.await().atMost(300, SECONDS) + .pollInterval(3, SECONDS).until( + { + def jobStatus = sql """ select status, ErrorMsg from jobs("type"="insert") where Name = '${jobName}' and ExecuteType='STREAMING' """ + log.info("jobStatus: " + jobStatus) + // check job status + jobStatus.size() == 1 && 'RUNNING' == jobStatus.get(0).get(0) && jobStatus.get(0).get(1).contains("Failed to fetch meta") + } + ) + } catch (Exception ex){ + def showjob = sql """select * from jobs("type"="insert") where Name='${jobName}'""" + def showtask = sql """select * from tasks("type"="insert") where JobName='${jobName}'""" + log.info("show job: " + showjob) + log.info("show task: " + showtask) + throw ex; + } + + // grant replication to user + connect("${pgUser}", "${pgPassword}", 
"jdbc:postgresql://${externalEnvIp}:${pg_port}/${pgDB}") { + sql """ALTER ROLE ${newPgUser} WITH REPLICATION""" + } + + + Awaitility.await().atMost(300, SECONDS) + .pollInterval(3, SECONDS).until( + { + def jobEndOffset = sql """select EndOffset from jobs("type"="insert") where Name='${jobName}'""" + log.info("jobEndOffset: " + jobEndOffset) + jobEndOffset.get(0).get(0).contains("lsn") + } + ) + + Awaitility.await().atMost(300, SECONDS) + .pollInterval(3, SECONDS).until( + { + def jobSucceedTaskCount = sql """select SucceedTaskCount from jobs("type"="insert") where Name='${jobName}'""" + log.info("jobSucceedTaskCount: " + jobSucceedTaskCount) + jobSucceedTaskCount.size() == 1 && jobSucceedTaskCount.get(0).get(0) >= '1' + } + ) + + // mock incremental into + connect("${newPgUser}", "${newPgPassword}", "jdbc:postgresql://${externalEnvIp}:${pg_port}/${pgDB}") { + sql """INSERT INTO ${pgDB}.${pgSchema}.${tableName} (name,age) VALUES ('Doris',18);""" + } + + sleep(30000) + + // check incremental data + qt_select """ SELECT * FROM ${tableName} order by name asc """ + + sql """ + DROP JOB IF EXISTS where jobname = '${jobName}' + """ + + def jobCountRsp = sql """select count(1) from jobs("type"="insert") where Name ='${jobName}'""" + assert jobCountRsp.get(0).get(0) == 0 + } +}