@@ -24,12 +24,14 @@
 import com.google.common.util.concurrent.ThreadFactoryBuilder;
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
 import lombok.extern.slf4j.Slf4j;
 import org.apache.pulsar.client.api.Schema;
 import org.apache.pulsar.client.api.schema.GenericRecord;
@@ -119,57 +121,65 @@ private void flush(Map<String, List<Record<GenericRecord>>> recordsToInsertByTopic) {
         final long timeStampForPartitioning = System.currentTimeMillis();
         for (Map.Entry<String, List<Record<GenericRecord>>> entry : recordsToInsertByTopic.entrySet()) {
             String topicName = entry.getKey();
-            List<Record<GenericRecord>> singleTopicRecordsToInsert = entry.getValue();
-            Record<GenericRecord> firstRecord = singleTopicRecordsToInsert.get(0);
-            Schema<GenericRecord> schema;
-            try {
-                schema = getPulsarSchema(firstRecord);
-            } catch (Exception e) {
-                log.error("Failed to retrieve message schema", e);
-                bulkHandleFailedRecords(e, singleTopicRecordsToInsert);
-                return;
-            }
-
-            if (!format.doSupportPulsarSchemaType(schema.getSchemaInfo().getType())) {
-                String errorMsg = "Sink does not support schema type of pulsar: " + schema.getSchemaInfo().getType();
-                log.error(errorMsg);
-                bulkHandleFailedRecords(new UnsupportedOperationException(errorMsg), singleTopicRecordsToInsert);
-                return;
-            }
+            Map<String, List<Record<GenericRecord>>> schemaVersionMap = entry.getValue().stream()
+                    .collect(Collectors.groupingBy(r -> {
+                        byte[] schemaVersionBytes = r.getValue().getSchemaVersion();
+                        return schemaVersionBytes != null ? Arrays.toString(schemaVersionBytes) : "null";
+                    }));
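+            // Group records by schema version: byte[] only has identity equality, so
+            // Arrays.toString supplies a value-based key ("null" when no version is attached).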
+            for (List<Record<GenericRecord>> singleTopicRecordsToInsert : schemaVersionMap.values()) {
+                Record<GenericRecord> firstRecord = singleTopicRecordsToInsert.get(0);
+                Schema<GenericRecord> schema;
+                try {
+                    schema = getPulsarSchema(firstRecord);
+                } catch (Exception e) {
+                    log.error("Failed to retrieve message schema", e);
+                    bulkHandleFailedRecords(e, singleTopicRecordsToInsert);
+                    return;
+                }
 
-            String filepath = "";
-            try {
-                format.initSchema(schema);
-                final Iterator<Record<GenericRecord>> iter = singleTopicRecordsToInsert.iterator();
-                filepath = buildPartitionPath(firstRecord, partitioner, format, timeStampForPartitioning);
-                ByteBuffer payload = bindValue(iter, format);
-                int uploadSize = singleTopicRecordsToInsert.size();
-                long uploadBytes = getBytesSum(singleTopicRecordsToInsert);
-                log.info("Uploading blob {} from topic {} uploadSize:{} uploadBytes:{} currentBatchStatus:{}",
-                        filepath, topicName, uploadSize, uploadBytes, batchManager.getCurrentStatsStr());
-                long elapsedMs = System.currentTimeMillis();
-                blobWriter.uploadBlob(filepath, payload);
-                elapsedMs = System.currentTimeMillis() - elapsedMs;
-                log.debug("Uploading blob {} elapsed time in ms: {}", filepath, elapsedMs);
-                singleTopicRecordsToInsert.forEach(Record::ack);
-                if (sinkContext != null) {
-                    sinkContext.recordMetric(METRICS_TOTAL_SUCCESS, singleTopicRecordsToInsert.size());
-                    sinkContext.recordMetric(METRICS_LATEST_UPLOAD_ELAPSED_TIME, elapsedMs);
+                if (!format.doSupportPulsarSchemaType(schema.getSchemaInfo().getType())) {
+                    String errorMsg = "Sink does not support schema type of pulsar: "
+                            + schema.getSchemaInfo().getType();
+                    log.error(errorMsg);
+                    bulkHandleFailedRecords(new UnsupportedOperationException(errorMsg), singleTopicRecordsToInsert);
+                    return;
                 }
-                log.info("Successfully uploaded blob {} from topic {} uploadSize {} uploadBytes {}",
-                        filepath, entry.getKey(),
-                        uploadSize, uploadBytes);
-            } catch (Exception e) {
-                if (e instanceof ContainerNotFoundException) {
-                    log.error("Blob {} is not found", filepath, e);
-                } else if (e instanceof IOException) {
-                    log.error("Failed to write to blob {}", filepath, e);
-                } else if (e instanceof UnsupportedOperationException || e instanceof IllegalArgumentException) {
-                    log.error("Failed to handle message schema {}", schema, e);
-                } else {
-                    log.error("Encountered unknown error writing to blob {}", filepath, e);
+
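+                // Upload this schema-version group as a single blob; failures below are handled per group.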
+                String filepath = "";
+                try {
+                    format.initSchema(schema);
+                    final Iterator<Record<GenericRecord>> iter = singleTopicRecordsToInsert.iterator();
+                    filepath = buildPartitionPath(firstRecord, partitioner, format, timeStampForPartitioning);
+                    ByteBuffer payload = bindValue(iter, format);
+                    int uploadSize = singleTopicRecordsToInsert.size();
+                    long uploadBytes = getBytesSum(singleTopicRecordsToInsert);
+                    log.info("Uploading blob {} from topic {} uploadSize:{} uploadBytes:{} currentBatchStatus:{}",
+                            filepath, topicName, uploadSize, uploadBytes, batchManager.getCurrentStatsStr());
+                    long elapsedMs = System.currentTimeMillis();
+                    blobWriter.uploadBlob(filepath, payload);
+                    elapsedMs = System.currentTimeMillis() - elapsedMs;
+                    log.debug("Uploading blob {} elapsed time in ms: {}", filepath, elapsedMs);
+                    singleTopicRecordsToInsert.forEach(Record::ack);
+                    if (sinkContext != null) {
+                        sinkContext.recordMetric(METRICS_TOTAL_SUCCESS, singleTopicRecordsToInsert.size());
+                        sinkContext.recordMetric(METRICS_LATEST_UPLOAD_ELAPSED_TIME, elapsedMs);
+                    }
+                    log.info("Successfully uploaded blob {} from topic {} uploadSize {} uploadBytes {}",
+                            filepath, topicName,
+                            uploadSize, uploadBytes);
+                } catch (Exception e) {
+                    if (e instanceof ContainerNotFoundException) {
+                        log.error("Blob {} is not found", filepath, e);
+                    } else if (e instanceof IOException) {
+                        log.error("Failed to write to blob {}", filepath, e);
+                    } else if (e instanceof UnsupportedOperationException || e instanceof IllegalArgumentException) {
+                        log.error("Failed to handle message schema {}", schema, e);
+                    } else {
+                        log.error("Encountered unknown error writing to blob {}", filepath, e);
+                    }
+                    bulkHandleFailedRecords(e, singleTopicRecordsToInsert);
                 }
-                bulkHandleFailedRecords(e, singleTopicRecordsToInsert);
             }
         }
     }
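
For reference, a minimal standalone sketch of the grouping trick this change relies on (all class and field names here are hypothetical, not taken from the sink): Java byte[] only has identity equality, so two equal schema-version arrays used directly as map keys would land in separate groups; Arrays.toString converts them to strings with value equality, and the "null" sentinel collects records that carry no schema version.

    import java.util.Arrays;
    import java.util.List;
    import java.util.Map;
    import java.util.stream.Collectors;

    public class SchemaVersionGroupingSketch {

        /** Hypothetical stand-in for Record<GenericRecord>; only the schema version matters here. */
        static class Msg {
            final String body;
            final byte[] schemaVersion;
            Msg(String body, byte[] schemaVersion) {
                this.body = body;
                this.schemaVersion = schemaVersion;
            }
        }

        public static void main(String[] args) {
            List<Msg> batch = Arrays.asList(
                    new Msg("a", new byte[]{0, 0, 0, 1}),
                    new Msg("b", new byte[]{0, 0, 0, 2}),
                    // same version as "a" but a distinct array instance: raw byte[] keys would NOT merge these
                    new Msg("c", new byte[]{0, 0, 0, 1}),
                    // no schema version attached
                    new Msg("d", null));

            // Same keying strategy as the sink: Arrays.toString gives the value
            // equality that byte[] itself lacks; "null" collects version-less records.
            Map<String, List<Msg>> byVersion = batch.stream()
                    .collect(Collectors.groupingBy(
                            m -> m.schemaVersion != null ? Arrays.toString(m.schemaVersion) : "null"));

            // Expected three groups: [0, 0, 0, 1] -> [a, c], [0, 0, 0, 2] -> [b], null -> [d]
            byVersion.forEach((version, msgs) -> System.out.println(version + " -> "
                    + msgs.stream().map(m -> m.body).collect(Collectors.toList())));
        }
    }

With this grouping in place, each call to format.initSchema and uploadBlob in the loop above operates on records that share one schema version, instead of assuming the first record's schema fits the whole topic batch.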
|