package com.marklogic.kafka.connect.sink;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.marklogic.client.DatabaseClient;
import com.marklogic.client.dataservices.IOEndpoint;
import com.marklogic.client.dataservices.InputCaller;
import com.marklogic.client.document.DocumentWriteOperation;
import com.marklogic.client.ext.DatabaseClientConfig;
import com.marklogic.client.ext.DefaultConfiguredDatabaseClientFactory;
import com.marklogic.client.io.BytesHandle;
import com.marklogic.client.io.Format;
import com.marklogic.client.io.JacksonHandle;
import com.marklogic.client.io.StringHandle;
import com.marklogic.client.io.marker.AbstractWriteHandle;
import com.marklogic.kafka.connect.DefaultDatabaseClientConfigBuilder;
import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.connect.sink.SinkRecord;
import org.springframework.util.StringUtils;

import java.nio.charset.StandardCharsets;
import java.util.Map;

/**
 * Uses Bulk Data Services - https://github.com/marklogic/java-client-api/wiki/Bulk-Data-Services - to allow the user
 * to provide their own endpoint implementation, thus giving the user full control over how data is written to
 * MarkLogic.
 */
class BulkDataServicesSinkTask extends AbstractSinkTask {

    private DatabaseClient databaseClient;
    private InputCaller.BulkInputCaller<JsonNode> bulkInputCaller;
    private ObjectMapper objectMapper;
    private SinkRecordConverter sinkRecordConverter;

    public BulkDataServicesSinkTask() {
        this.objectMapper = new ObjectMapper();
    }

    @Override
    protected void onStart(Map<String, Object> parsedConfig) {
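        // Build a DatabaseClient from the connector config, read the user-provided Bulk Data Services API
        // declaration, and construct a BulkInputCaller from it. Records passed to writeSinkRecord are then
        // buffered by the caller and written once the batch size defined in the API declaration is reached,
        // or when flush is invoked.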
        DatabaseClientConfig databaseClientConfig = new DefaultDatabaseClientConfigBuilder().buildDatabaseClientConfig(parsedConfig);
        this.databaseClient = new DefaultConfiguredDatabaseClientFactory().newDatabaseClient(databaseClientConfig);

        JacksonHandle modulesHandle = readApiDeclarationFromMarkLogic(parsedConfig, databaseClientConfig);
        InputCaller<JsonNode> inputCaller = InputCaller.on(databaseClient, modulesHandle, new JacksonHandle().withFormat(Format.JSON));

        IOEndpoint.CallContext callContext = inputCaller.newCallContext()
            .withEndpointConstants(new JacksonHandle(buildEndpointConstants(parsedConfig)));
        this.bulkInputCaller = inputCaller.bulkCaller(callContext);
        this.configureErrorListenerOnBulkInputCaller();

        this.sinkRecordConverter = new DefaultSinkRecordConverter(parsedConfig);
    }

    /**
     * When Kafka invokes this method - the frequency of which can be controlled by the user - perform a synchronous
     * flush of any records waiting to be written to MarkLogic. {@code BulkInputCaller} does not yet have an
     * asynchronous flush like DMSDK does, but a synchronous flush is appropriate here - Kafka expects {@code flush}
     * to be a blocking call, while {@code put} is expected to return quickly.
     * <p>
     * For a good reference, see https://stackoverflow.com/questions/44871377/put-vs-flush-in-kafka-connector-sink-task
     *
     * @param currentOffsets the current offset for each topic partition; not used by this implementation
     */
    @Override
    public void flush(Map<TopicPartition, OffsetAndMetadata> currentOffsets) {
        if (bulkInputCaller != null) {
            logger.info("Flushing BulkInputCaller");
            bulkInputCaller.awaitCompletion();
            logger.info("Finished flushing BulkInputCaller");
        }
    }

    @Override
    public void stop() {
        flush(null);
        if (databaseClient != null) {
            databaseClient.release();
        }
    }

    /**
     * Queues up the sink record for writing to MarkLogic. Once the batch size, as defined in the Bulk API declaration,
     * is reached, the {@code BulkInputCaller} will write the data to MarkLogic.
     *
     * @param sinkRecord the Kafka record to queue for writing to MarkLogic
     */
    @Override
    protected void writeSinkRecord(SinkRecord sinkRecord) {
        DocumentWriteOperation writeOp = sinkRecordConverter.convert(sinkRecord);
        JsonNode input = buildBulkDataServiceInput(writeOp, sinkRecord);
        bulkInputCaller.accept(input);
    }

    /**
     * Bulk Data Services requires that the API declaration exist in the modules database associated with the app
     * server that the connector will talk to.
     *
     * @param parsedConfig         the parsed connector configuration
     * @param databaseClientConfig the client configuration, temporarily pointed at the modules database
     * @return a handle containing the API declaration JSON read from the modules database
     */
    private JacksonHandle readApiDeclarationFromMarkLogic(Map<String, Object> parsedConfig, DatabaseClientConfig databaseClientConfig) {
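        // For illustration only - a minimal sketch of what the API declaration at bulkApiUri might look like,
        // based on the Bulk Data Services documentation linked in the class javadoc. The endpoint path and
        // batch size below are hypothetical, not values shipped with or required by this connector:
        //
        // {
        //   "endpoint": "/example/bulk-endpoint.sjs",
        //   "params": [
        //     {"name": "endpointConstants", "datatype": "jsonDocument", "multiple": false, "nullable": true},
        //     {"name": "input", "datatype": "jsonDocument", "multiple": true, "nullable": true}
        //   ],
        //   "$bulk": {"inputBatchSize": 100}
        // }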
        final String bulkApiUri = (String) parsedConfig.get(MarkLogicSinkConfig.BULK_DS_API_URI);
        final String modulesDatabase = (String) parsedConfig.get(MarkLogicSinkConfig.CONNECTION_MODULES_DATABASE);
        if (!StringUtils.hasText(modulesDatabase)) {
            throw new IllegalArgumentException("Cannot read Bulk Data Services API declaration at URI: " + bulkApiUri
                + "; no modules database configured. Please set the "
                + MarkLogicSinkConfig.CONNECTION_MODULES_DATABASE + " property.");
        }

        final String originalDatabase = databaseClientConfig.getDatabase();
        try {
            databaseClientConfig.setDatabase(modulesDatabase);
            DatabaseClient modulesClient = new DefaultConfiguredDatabaseClientFactory().newDatabaseClient(databaseClientConfig);
            return modulesClient.newJSONDocumentManager().read(bulkApiUri, new JacksonHandle().withFormat(Format.JSON));
        } catch (Exception ex) {
            // The stacktrace isn't of any value here for a user; the message below will provide sufficient information
            // for debugging
            throw new RuntimeException("Unable to read Bulk Data Services API declaration at URI: " + bulkApiUri +
                "; modules database: " + modulesDatabase + "; cause: " + ex.getMessage());
        } finally {
            databaseClientConfig.setDatabase(originalDatabase);
        }
    }

    /**
     * When using Bulk Data Services, include all "ml.document" config options in the endpoint constants in case the
     * endpoint developer wishes to use these.
     *
     * @param parsedConfig the parsed connector configuration
     * @return an ObjectNode containing each "ml.document" config option as a string value
     */
    private ObjectNode buildEndpointConstants(Map<String, Object> parsedConfig) {
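        // For example - assuming, hypothetically, that the config defines document options such as
        // "ml.document.collections" and "ml.document.format" - the endpoint constants passed to the endpoint
        // would look like: {"ml.document.collections": "kafka-data", "ml.document.format": "JSON"}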
        ObjectNode endpointConstants = this.objectMapper.createObjectNode();
        for (String key : parsedConfig.keySet()) {
            if (key.startsWith("ml.document")) {
                Object value = parsedConfig.get(key);
                if (value != null) {
                    endpointConstants.put(key, value.toString());
                }
            }
        }
        return endpointConstants;
    }

    /**
     * An envelope structure is used so that both the content and Kafka metadata from the sink record can be sent to
     * the endpoint.
     *
     * @param writeOp    the write operation produced by the {@code SinkRecordConverter}; only its content is used
     * @param sinkRecord the Kafka record providing the metadata
     * @return the envelope to send to the Bulk Data Services endpoint
     */
    private JsonNode buildBulkDataServiceInput(DocumentWriteOperation writeOp, SinkRecord sinkRecord) {
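        // The envelope built below has the following shape; the values shown are purely illustrative and come
        // from the sink record at runtime, and "key", "partition", and "timestamp" are only included when
        // present on the record:
        //
        // {
        //   "content": "<the converted document content as a string>",
        //   "kafka-metadata": {
        //     "topic": "my-topic",
        //     "key": "some-key",
        //     "offset": 123,
        //     "partition": 0,
        //     "timestamp": 1637000000000
        //   }
        // }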
        AbstractWriteHandle handle = writeOp.getContent();
        // This assumes that the SinkRecordConverter always constructs either a BytesHandle or StringHandle. This is an
        // implementation detail not exposed to the user, and sufficient testing should ensure that this assumption
        // holds up over time.
        String content;
        if (handle instanceof BytesHandle) {
            content = new String(((BytesHandle) handle).get(), StandardCharsets.UTF_8);
        } else {
            content = ((StringHandle) handle).get();
        }
        ObjectNode input = this.objectMapper.createObjectNode();
        input.put("content", content);

        ObjectNode kafkaMetadata = input.putObject("kafka-metadata");
        kafkaMetadata.put("topic", sinkRecord.topic());
        Object key = sinkRecord.key();
        if (key != null) {
            kafkaMetadata.put("key", key.toString());
        }
        kafkaMetadata.put("offset", sinkRecord.kafkaOffset());
        Integer partition = sinkRecord.kafkaPartition();
        if (partition != null) {
            kafkaMetadata.put("partition", partition);
        }
        Long timestamp = sinkRecord.timestamp();
        if (timestamp != null) {
            kafkaMetadata.put("timestamp", timestamp);
        }
        return input;
    }

    /**
     * For the initial release of this capability, the "skip" approach is applied, which behaves in the same manner as
     * the existing WriteBatcher approach - i.e. the failure is logged and other records/batches continue to be
     * processed. This can be made configurable in the future if a client wants "stop all calls" support.
     */
    private void configureErrorListenerOnBulkInputCaller() {
        this.bulkInputCaller.setErrorListener((retryCount, throwable, callContext, input) -> {
            // The stacktrace is not included here, as it will only contain references to Bulk Data Services code and
            // connector code, which won't help with debugging. The MarkLogic error log will be of much more value,
            // along with seeing the error message here.
            logger.error("Skipping failed write; cause: " + throwable.getMessage() + "; check the MarkLogic error " +
                "log file for additional information as to the cause of the failed write");
            return IOEndpoint.BulkIOEndpointCaller.ErrorDisposition.SKIP_CALL;
        });
    }
}