Skip to content

Commit 6431783

Browse files
committed
Kafka source offset-based deduplication.
1 parent cb6e451 commit 6431783

File tree

11 files changed: +128 additions, −15 deletions

sdks/java/core/src/main/java/org/apache/beam/sdk/io/UnboundedSource.java

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -94,14 +94,13 @@ public boolean requiresDeduping() {
9494
}
9595

9696
/**
97-
* If isOffsetDeduplication returns true, then the UnboundedSource needs to
98-
* provide the following:
99-
*
97+
* If isOffsetDeduplication returns true, then the UnboundedSource needs to provide the following:
98+
*
10099
* <ul>
101-
* <li>UnboundedReader which provides offsets that are unique for each
102-
* element and lexicographically ordered.</li>
103-
* <li>CheckpointMark which provides an offset greater than all elements
104-
* read and less than or equal to the next offset that will be read.</li>
100+
* <li>UnboundedReader which provides offsets that are unique for each element and
101+
* lexicographically ordered.
102+
* <li>CheckpointMark which provides an offset greater than all elements read and less than or
103+
* equal to the next offset that will be read.
105104
* </ul>
106105
*/
107106
public boolean isOffsetDeduplication() {

runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/OrderedCode.java renamed to sdks/java/core/src/main/java/org/apache/beam/sdk/util/OrderedCode.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
* See the License for the specific language governing permissions and
1616
* limitations under the License.
1717
*/
18-
package org.apache.beam.runners.dataflow.worker;
18+
package org.apache.beam.sdk.util;
1919

2020
import java.math.RoundingMode;
2121
import java.util.ArrayList;

runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/OrderedCodeTest.java renamed to sdks/java/core/src/test/java/org/apache/beam/sdk/util/OrderedCodeTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
* See the License for the specific language governing permissions and
1616
* limitations under the License.
1717
*/
18-
package org.apache.beam.runners.dataflow.worker;
18+
package org.apache.beam.sdk.util;
1919

2020
import static org.junit.Assert.assertArrayEquals;
2121
import static org.junit.Assert.assertEquals;

sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaCheckpointMark.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
*/
1818
package org.apache.beam.sdk.io.kafka;
1919

20+
import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState;
21+
2022
import java.io.Serializable;
2123
import java.util.List;
2224
import java.util.Optional;
@@ -66,6 +68,23 @@ public String toString() {
6668
return "KafkaCheckpointMark{partitions=" + Joiner.on(",").join(partitions) + '}';
6769
}
6870

71+
@Override
72+
public byte[] getOffsetLimit() {
73+
if (!reader.isPresent()) {
74+
throw new RuntimeException(
75+
"KafkaCheckpointMark reader is not present.");
76+
}
77+
if (!reader.get().offsetDeduplication()) {
78+
throw new RuntimeException(
79+
"Unexpected getOffsetLimit() called while KafkaUnboundedReader not configured for offset deduplication.");
80+
}
81+
82+
// KafkaUnboundedSource.split() must produce a 1:1 partition to split ratio.
83+
checkState(partitions.size() == 1);
84+
PartitionMark partition = partitions.get(0);
85+
return KafkaIOUtils.getOrderedCode(partition.getNextOffset());
86+
}
87+
6988
/**
7089
* A tuple to hold topic, partition, and offset that comprise the checkpoint for a single
7190
* partition.

sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaIO.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -610,6 +610,7 @@ public static <K, V> Read<K, V> read() {
610610
.setRedistributed(false)
611611
.setAllowDuplicates(false)
612612
.setRedistributeNumKeys(0)
613+
.setOffsetDeduplication(false)
613614
.build();
614615
}
615616

@@ -717,6 +718,9 @@ public abstract static class Read<K, V>
717718
@Pure
718719
public abstract int getRedistributeNumKeys();
719720

721+
@Pure
722+
public abstract boolean isOffsetDeduplication();
723+
720724
@Pure
721725
public abstract @Nullable Duration getWatchTopicPartitionDuration();
722726

@@ -782,6 +786,8 @@ abstract Builder<K, V> setConsumerFactoryFn(
782786

783787
abstract Builder<K, V> setRedistributeNumKeys(int redistributeNumKeys);
784788

789+
abstract Builder<K, V> setOffsetDeduplication(boolean offsetDeduplication);
790+
785791
abstract Builder<K, V> setTimestampPolicyFactory(
786792
TimestampPolicyFactory<K, V> timestampPolicyFactory);
787793

@@ -892,6 +898,10 @@ static <K, V> void setupExternalBuilder(
892898
builder.setRedistributeNumKeys(0);
893899
builder.setAllowDuplicates(false);
894900
}
901+
// TODO(tomstepp): Auto-enable offset deduplication if: redistributed and !allowDuplicates.
902+
if (config.offsetDeduplication != null) {
903+
builder.setOffsetDeduplication(config.offsetDeduplication);
904+
}
895905
}
896906

897907
private static <T> Coder<T> resolveCoder(Class<Deserializer<T>> deserializer) {
@@ -959,6 +969,7 @@ public static class Configuration {
959969
private Integer redistributeNumKeys;
960970
private Boolean redistribute;
961971
private Boolean allowDuplicates;
972+
private Boolean offsetDeduplication;
962973

963974
public void setConsumerConfig(Map<String, String> consumerConfig) {
964975
this.consumerConfig = consumerConfig;
@@ -1015,6 +1026,10 @@ public void setRedistribute(Boolean redistribute) {
10151026
public void setAllowDuplicates(Boolean allowDuplicates) {
10161027
this.allowDuplicates = allowDuplicates;
10171028
}
1029+
1030+
public void setOffsetDeduplication(Boolean offsetDeduplication) {
1031+
this.offsetDeduplication = offsetDeduplication;
1032+
}
10181033
}
10191034
}
10201035

@@ -1086,6 +1101,10 @@ public Read<K, V> withRedistributeNumKeys(int redistributeNumKeys) {
10861101
return toBuilder().setRedistributeNumKeys(redistributeNumKeys).build();
10871102
}
10881103

1104+
public Read<K, V> withOffsetDeduplication(boolean offsetDeduplication) {
1105+
return toBuilder().setOffsetDeduplication(offsetDeduplication).build();
1106+
}
1107+
10891108
/**
10901109
* Internally sets a {@link java.util.regex.Pattern} of topics to read from. All the partitions
10911110
* from each of the matching topics are read.

sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaIOReadImplementationCompatibility.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,12 @@ Object getDefaultValue() {
137137
return false;
138138
}
139139
},
140+
OFFSET_DEDUPLICATION(LEGACY) {
141+
@Override
142+
Object getDefaultValue() {
143+
return false;
144+
}
145+
}
140146
;
141147

142148
private final @NonNull ImmutableSet<KafkaIOReadImplementation> supportedImplementations;

sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaIOUtils.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,12 @@
1919

2020
import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument;
2121

22+
import java.nio.charset.StandardCharsets;
2223
import java.util.HashMap;
2324
import java.util.Map;
2425
import java.util.Random;
2526
import org.apache.beam.sdk.transforms.SerializableFunction;
27+
import org.apache.beam.sdk.util.OrderedCode;
2628
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap;
2729
import org.apache.kafka.clients.consumer.Consumer;
2830
import org.apache.kafka.clients.consumer.ConsumerConfig;
@@ -142,4 +144,14 @@ void update(double quantity) {
142144
return avg;
143145
}
144146
}
147+
148+
static byte[] getOrderedCode(long offset) {
149+
OrderedCode orderedCode = new OrderedCode();
150+
orderedCode.writeNumIncreasing(offset);
151+
return orderedCode.getEncodedBytes();
152+
}
153+
154+
static byte[] getUniqueId(String topic, int partition, long offset) {
155+
return (topic + "-" + partition + "-" + offset).getBytes(StandardCharsets.UTF_8);
156+
}
145157
}

sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaUnboundedReader.java

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,10 @@ public boolean advance() throws IOException {
214214
curTimestamp =
215215
pState.timestampPolicy.getTimestampForRecord(pState.mkTimestampPolicyContext(), record);
216216
curRecord = record;
217+
if (this.offsetDeduplication) {
218+
curOffset = KafkaIOUtils.getOrderedCode(offset);
219+
curId = KafkaIOUtils.getUniqueId(rawRecord.topic(), rawRecord.partition(), rawRecord.offset());
220+
}
217221

218222
int recordSize =
219223
(rawRecord.key() == null ? 0 : rawRecord.key().length)
@@ -299,6 +303,30 @@ public Instant getCurrentTimestamp() throws NoSuchElementException {
299303
return curTimestamp;
300304
}
301305

306+
@Override
307+
public byte[] getCurrentRecordId() throws NoSuchElementException {
308+
if (curId == null) {
309+
if (this.offsetDeduplication) {
310+
throw new NoSuchElementException();
311+
} else {
312+
return new byte[0];
313+
}
314+
}
315+
return curId;
316+
}
317+
318+
@Override
319+
public byte[] getCurrentRecordOffset() throws NoSuchElementException {
320+
if (curOffset == null) {
321+
if (this.offsetDeduplication) {
322+
throw new NoSuchElementException();
323+
} else {
324+
return new byte[0];
325+
}
326+
}
327+
return curOffset;
328+
}
329+
302330
@Override
303331
public long getSplitBacklogBytes() {
304332
long backlogBytes = 0;
@@ -314,6 +342,10 @@ public long getSplitBacklogBytes() {
314342
return backlogBytes;
315343
}
316344

345+
public boolean offsetDeduplication() {
346+
return offsetDeduplication;
347+
}
348+
317349
////////////////////////////////////////////////////////////////////////////////////////////////
318350

319351
private static final Logger LOG = LoggerFactory.getLogger(KafkaUnboundedReader.class);
@@ -336,6 +368,10 @@ public long getSplitBacklogBytes() {
336368
private @Nullable Instant curTimestamp = null;
337369
private Iterator<PartitionState<K, V>> curBatch = Collections.emptyIterator();
338370

371+
private final boolean offsetDeduplication;
372+
private byte[] curOffset = new byte[0];
373+
private byte[] curId = new byte[0];
374+
339375
private @Nullable Deserializer<K> keyDeserializerInstance = null;
340376
private @Nullable Deserializer<V> valueDeserializerInstance = null;
341377

@@ -507,6 +543,7 @@ Instant updateAndGetWatermark() {
507543
KafkaUnboundedSource<K, V> source, @Nullable KafkaCheckpointMark checkpointMark) {
508544
this.source = source;
509545
this.name = "Reader-" + source.getId();
546+
this.offsetDeduplication = source.isOffsetDeduplication();
510547

511548
List<TopicPartition> partitions =
512549
Preconditions.checkArgumentNotNull(source.getSpec().getTopicPartitions());

sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaUnboundedSource.java

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -113,10 +113,16 @@ public List<KafkaUnboundedSource<K, V>> split(int desiredNumSplits, PipelineOpti
113113
partitions.size() > 0,
114114
"Could not find any partitions. Please check Kafka configuration and topic names");
115115

116-
int numSplits = Math.min(desiredNumSplits, partitions.size());
117-
// XXX make all splits have the same # of partitions
118-
while (partitions.size() % numSplits > 0) {
119-
++numSplits;
116+
int numSplits;
117+
if (isOffsetDeduplication()) {
118+
// Enforce 1:1 split to partition ratio for offset deduplication.
119+
numSplits = partitions.size();
120+
} else {
121+
numSplits = Math.min(desiredNumSplits, partitions.size());
122+
// Make all splits have the same # of partitions.
123+
while (partitions.size() % numSplits > 0) {
124+
++numSplits;
125+
}
120126
}
121127
List<List<TopicPartition>> assignments = new ArrayList<>(numSplits);
122128

@@ -177,6 +183,11 @@ public boolean requiresDeduping() {
177183
return false;
178184
}
179185

186+
@Override
187+
public boolean isOffsetDeduplication() {
188+
return spec.isOffsetDeduplication();
189+
}
190+
180191
@Override
181192
public Coder<KafkaRecord<K, V>> getOutputCoder() {
182193
Coder<K> keyCoder = Preconditions.checkStateNotNull(spec.getKeyCoder());

sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaIOExternalTest.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,8 @@ public void testConstructKafkaRead() throws Exception {
111111
Field.of("consumer_polling_timeout", FieldType.INT64),
112112
Field.of("redistribute_num_keys", FieldType.INT32),
113113
Field.of("redistribute", FieldType.BOOLEAN),
114-
Field.of("allow_duplicates", FieldType.BOOLEAN)))
114+
Field.of("allow_duplicates", FieldType.BOOLEAN),
115+
Field.of("offset_deduplication", FieldType.BOOLEAN)))
115116
.withFieldValue("topics", topics)
116117
.withFieldValue("consumer_config", consumerConfig)
117118
.withFieldValue("key_deserializer", keyDeserializer)
@@ -123,6 +124,7 @@ public void testConstructKafkaRead() throws Exception {
123124
.withFieldValue("redistribute_num_keys", 0)
124125
.withFieldValue("redistribute", false)
125126
.withFieldValue("allow_duplicates", false)
127+
.withFieldValue("offset_deduplication", false)
126128
.build());
127129

128130
RunnerApi.Components defaultInstance = RunnerApi.Components.getDefaultInstance();
@@ -247,7 +249,8 @@ public void testConstructKafkaReadWithoutMetadata() throws Exception {
247249
Field.of("timestamp_policy", FieldType.STRING),
248250
Field.of("redistribute_num_keys", FieldType.INT32),
249251
Field.of("redistribute", FieldType.BOOLEAN),
250-
Field.of("allow_duplicates", FieldType.BOOLEAN)))
252+
Field.of("allow_duplicates", FieldType.BOOLEAN),
253+
Field.of("offset_deduplication", FieldType.BOOLEAN)))
251254
.withFieldValue("topics", topics)
252255
.withFieldValue("consumer_config", consumerConfig)
253256
.withFieldValue("key_deserializer", keyDeserializer)
@@ -258,6 +261,7 @@ public void testConstructKafkaReadWithoutMetadata() throws Exception {
258261
.withFieldValue("redistribute_num_keys", 0)
259262
.withFieldValue("redistribute", false)
260263
.withFieldValue("allow_duplicates", false)
264+
.withFieldValue("offset_deduplication", false)
261265
.build());
262266

263267
RunnerApi.Components defaultInstance = RunnerApi.Components.getDefaultInstance();

Comments (0)