
Commit fd4c04d

Use currentLag and position to set end offset estimate
1 parent 5a00d73 commit fd4c04d

8 files changed: +36 -151 lines changed

8 files changed

+36
-151
lines changed

buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy

Lines changed: 2 additions & 2 deletions
@@ -629,7 +629,7 @@ class BeamModulePlugin implements Plugin<Project> {
     def jaxb_api_version = "2.3.3"
     def jsr305_version = "3.0.2"
     def everit_json_version = "1.14.2"
-    def kafka_version = "2.8.2"
+    def kafka_version = "3.9.1"
     def log4j2_version = "2.20.0"
     def nemo_version = "0.1"
     // [bomupgrader] determined by: io.grpc:grpc-netty, consistent with: google_cloud_platform_libraries_bom
@@ -832,7 +832,7 @@ class BeamModulePlugin implements Plugin<Project> {
         jupiter_api : "org.junit.jupiter:junit-jupiter-api:$jupiter_version",
         jupiter_engine : "org.junit.jupiter:junit-jupiter-engine:$jupiter_version",
         jupiter_params : "org.junit.jupiter:junit-jupiter-params:$jupiter_version",
-        kafka : "org.apache.kafka:kafka_2.11:$kafka_version",
+        kafka : "org.apache.kafka:kafka_2.12:$kafka_version",
         kafka_clients : "org.apache.kafka:kafka-clients:$kafka_version",
         log4j : "log4j:log4j:1.2.17",
         log4j_over_slf4j : "org.slf4j:log4j-over-slf4j:$slf4j_version",

sdks/java/io/kafka/build.gradle

Lines changed: 1 addition & 3 deletions
@@ -36,9 +36,7 @@ ext {
 }

 def kafkaVersions = [
-  '282': "2.8.2",
-  '312': "3.1.2",
-  '390': "3.9.0",
+  '391': "3.9.1",
 ]

 kafkaVersions.each{k,v -> configurations.create("kafkaVersion$k")}

sdks/java/io/kafka/kafka-312/build.gradle

Lines changed: 0 additions & 24 deletions
This file was deleted.

sdks/java/io/kafka/kafka-390/build.gradle

Lines changed: 0 additions & 24 deletions
This file was deleted.

Lines changed: 2 additions & 2 deletions

@@ -16,8 +16,8 @@
  * limitations under the License.
  */
 project.ext {
-  delimited="2.8.2"
-  undelimited="282"
+  delimited="3.9.1"
+  undelimited="391"
   sdfCompatible=true
 }

sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFn.java

Lines changed: 27 additions & 80 deletions
@@ -19,15 +19,14 @@

 import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState;

-import java.io.Closeable;
 import java.math.BigDecimal;
 import java.math.MathContext;
 import java.time.Duration;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
-import java.util.function.Supplier;
+import java.util.concurrent.atomic.AtomicLong;
 import org.apache.beam.sdk.coders.Coder;
 import org.apache.beam.sdk.io.kafka.KafkaIO.ReadSourceDescriptors;
 import org.apache.beam.sdk.io.kafka.KafkaIOUtils.MovingAvg;
@@ -48,7 +47,6 @@
 import org.apache.beam.sdk.transforms.splittabledofn.WatermarkEstimator;
 import org.apache.beam.sdk.transforms.splittabledofn.WatermarkEstimators.MonotonicallyIncreasing;
 import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.util.ExpiringMemoizingSerializableSupplier;
 import org.apache.beam.sdk.util.MemoizingPerInstantiationSerializableSupplier;
 import org.apache.beam.sdk.util.Preconditions;
 import org.apache.beam.sdk.util.SerializableSupplier;
@@ -70,6 +68,7 @@
 import org.apache.kafka.clients.consumer.ConsumerConfig;
 import org.apache.kafka.clients.consumer.ConsumerRecord;
 import org.apache.kafka.clients.consumer.ConsumerRecords;
+import org.apache.kafka.common.KafkaException;
 import org.apache.kafka.common.PartitionInfo;
 import org.apache.kafka.common.TopicPartition;
 import org.apache.kafka.common.config.ConfigDef;
@@ -225,30 +224,12 @@ public MovingAvg load(KafkaSourceDescriptor kafkaSourceDescriptor)
                 CacheBuilder.newBuilder()
                     .concurrencyLevel(Runtime.getRuntime().availableProcessors())
                     .weakValues()
-                    .removalListener(
-                        (RemovalNotification<KafkaSourceDescriptor, KafkaLatestOffsetEstimator>
-                                notification) -> {
-                          final @Nullable KafkaLatestOffsetEstimator value;
-                          if (notification.getCause() == RemovalCause.COLLECTED
-                              && (value = notification.getValue()) != null) {
-                            value.close();
-                          }
-                        })
                     .build(
-                        new CacheLoader<KafkaSourceDescriptor, KafkaLatestOffsetEstimator>() {
+                        new CacheLoader<KafkaSourceDescriptor, AtomicLong>() {
                           @Override
-                          public KafkaLatestOffsetEstimator load(
-                              final KafkaSourceDescriptor sourceDescriptor) {
-                            LOG.info(
-                                "Creating Kafka consumer for offset estimation for {}",
-                                sourceDescriptor);
-                            final Map<String, Object> config =
-                                KafkaIOUtils.overrideBootstrapServersConfig(
-                                    consumerConfig, sourceDescriptor);
-                            final Consumer<byte[], byte[]> consumer =
-                                consumerFactoryFn.apply(config);
-                            return new KafkaLatestOffsetEstimator(
-                                consumer, sourceDescriptor.getTopicPartition());
+                          public AtomicLong load(final KafkaSourceDescriptor sourceDescriptor) {
+                            LOG.info("Creating end offset estimator for {}", sourceDescriptor);
+                            return new AtomicLong(Long.MIN_VALUE);
                           }
                         }));
     this.pollConsumerCacheSupplier =
@@ -309,8 +290,7 @@ public Consumer<byte[], byte[]> load(
   private final SerializableSupplier<LoadingCache<KafkaSourceDescriptor, MovingAvg>>
       avgRecordSizeCacheSupplier;

-  private final SerializableSupplier<
-          LoadingCache<KafkaSourceDescriptor, KafkaLatestOffsetEstimator>>
+  private final SerializableSupplier<LoadingCache<KafkaSourceDescriptor, AtomicLong>>
       latestOffsetEstimatorCacheSupplier;

   private final SerializableSupplier<LoadingCache<KafkaSourceDescriptor, Consumer<byte[], byte[]>>>
@@ -319,8 +299,7 @@ public Consumer<byte[], byte[]> load(
   private transient @MonotonicNonNull LoadingCache<KafkaSourceDescriptor, MovingAvg>
       avgRecordSizeCache;

-  private transient @MonotonicNonNull LoadingCache<
-          KafkaSourceDescriptor, KafkaLatestOffsetEstimator>
+  private transient @MonotonicNonNull LoadingCache<KafkaSourceDescriptor, AtomicLong>
       latestOffsetEstimatorCache;

   private transient @MonotonicNonNull LoadingCache<KafkaSourceDescriptor, Consumer<byte[], byte[]>>
@@ -339,46 +318,6 @@ public Consumer<byte[], byte[]> load(
   @VisibleForTesting
   static final String RAW_SIZE_METRIC_PREFIX = KafkaUnboundedReader.RAW_SIZE_METRIC_PREFIX;

-  /**
-   * A {@link GrowableOffsetRangeTracker.RangeEndEstimator} which uses a Kafka {@link Consumer} to
-   * fetch backlog.
-   */
-  private static class KafkaLatestOffsetEstimator
-      implements GrowableOffsetRangeTracker.RangeEndEstimator, Closeable {
-    private final Consumer<byte[], byte[]> offsetConsumer;
-    private final Supplier<Long> offsetSupplier;
-
-    KafkaLatestOffsetEstimator(
-        final Consumer<byte[], byte[]> offsetConsumer, final TopicPartition topicPartition) {
-      this.offsetConsumer = offsetConsumer;
-      this.offsetSupplier =
-          new ExpiringMemoizingSerializableSupplier<>(
-              () -> {
-                try {
-                  return offsetConsumer
-                      .endOffsets(Collections.singleton(topicPartition))
-                      .getOrDefault(topicPartition, Long.MIN_VALUE);
-                } catch (Throwable t) {
-                  LOG.error("Failed to get end offset for {}", topicPartition, t);
-                  return Long.MIN_VALUE;
-                }
-              },
-              Duration.ofSeconds(1),
-              Long.MIN_VALUE,
-              Duration.ZERO);
-    }
-
-    @Override
-    public long estimate() {
-      return offsetSupplier.get();
-    }
-
-    @Override
-    public void close() {
-      offsetConsumer.close();
-    }
-  }
-
   @GetInitialRestriction
   @RequiresNonNull({"pollConsumerCache"})
   public OffsetRange initialRestriction(@Element KafkaSourceDescriptor kafkaSourceDescriptor) {
@@ -490,8 +429,8 @@ public double getSize(
   @RequiresNonNull({"latestOffsetEstimatorCache"})
   public OffsetRangeTracker restrictionTracker(
      @Element KafkaSourceDescriptor kafkaSourceDescriptor, @Restriction OffsetRange restriction) {
-    final LoadingCache<KafkaSourceDescriptor, KafkaLatestOffsetEstimator>
-        latestOffsetEstimatorCache = this.latestOffsetEstimatorCache;
+    final LoadingCache<KafkaSourceDescriptor, AtomicLong> latestOffsetEstimatorCache =
+        this.latestOffsetEstimatorCache;

    if (restriction.getTo() < Long.MAX_VALUE) {
      return new OffsetRangeTracker(restriction);
@@ -500,8 +439,9 @@ public OffsetRangeTracker restrictionTracker(
    // OffsetEstimators are cached for each topic-partition because they hold a stateful connection,
    // so we want to minimize the amount of connections that we start and track with Kafka. Another
    // point is that it has a memoized backlog, and this should make that more reusable estimations.
-    return new GrowableOffsetRangeTracker(
-        restriction.getFrom(), latestOffsetEstimatorCache.getUnchecked(kafkaSourceDescriptor));
+    final AtomicLong latestOffsetEstimator =
+        latestOffsetEstimatorCache.getUnchecked(kafkaSourceDescriptor);
+    return new GrowableOffsetRangeTracker(restriction.getFrom(), latestOffsetEstimator::get);
   }

   @ProcessElement
@@ -514,14 +454,13 @@ public ProcessContinuation processElement(
       throws Exception {
     final LoadingCache<KafkaSourceDescriptor, MovingAvg> avgRecordSizeCache =
         this.avgRecordSizeCache;
-    final LoadingCache<KafkaSourceDescriptor, KafkaLatestOffsetEstimator>
-        latestOffsetEstimatorCache = this.latestOffsetEstimatorCache;
+    final LoadingCache<KafkaSourceDescriptor, AtomicLong> latestOffsetEstimatorCache =
+        this.latestOffsetEstimatorCache;
     final LoadingCache<KafkaSourceDescriptor, Consumer<byte[], byte[]>> pollConsumerCache =
         this.pollConsumerCache;

     final MovingAvg avgRecordSize = avgRecordSizeCache.get(kafkaSourceDescriptor);
-    final KafkaLatestOffsetEstimator latestOffsetEstimator =
-        latestOffsetEstimatorCache.get(kafkaSourceDescriptor);
+    final AtomicLong latestOffsetEstimator = latestOffsetEstimatorCache.get(kafkaSourceDescriptor);
     final Consumer<byte[], byte[]> consumer = pollConsumerCache.get(kafkaSourceDescriptor);
     final Deserializer<K> keyDeserializerInstance =
         Preconditions.checkStateNotNull(this.keyDeserializerInstance);
@@ -569,6 +508,14 @@ public ProcessContinuation processElement(
         // Fetch the next records.
         final ConsumerRecords<byte[], byte[]> rawRecords = consumer.poll(remainingTimeout);
         final Duration elapsed = pollTimer.elapsed();
+        try {
+          final long position = consumer.position(topicPartition);
+          consumer
+              .currentLag(topicPartition)
+              .ifPresent(lag -> latestOffsetEstimator.lazySet(position + lag));
+        } catch (KafkaException e) {
+        }
+
         try {
           remainingTimeout = remainingTimeout.minus(elapsed);
         } catch (ArithmeticException e) {
@@ -676,7 +623,7 @@ public ProcessContinuation processElement(

       final long estimatedBacklogBytes =
           (long)
-              (BigDecimal.valueOf(latestOffsetEstimator.estimate())
+              (BigDecimal.valueOf(latestOffsetEstimator.get())
                   .subtract(BigDecimal.valueOf(expectedOffset), MathContext.DECIMAL128)
                   .doubleValue()
                   * avgRecordSize.get());
@@ -741,8 +688,8 @@ public void setup() throws Exception {
   public void teardown() throws Exception {
     final LoadingCache<KafkaSourceDescriptor, MovingAvg> avgRecordSizeCache =
         this.avgRecordSizeCache;
-    final LoadingCache<KafkaSourceDescriptor, KafkaLatestOffsetEstimator>
-        latestOffsetEstimatorCache = this.latestOffsetEstimatorCache;
+    final LoadingCache<KafkaSourceDescriptor, AtomicLong> latestOffsetEstimatorCache =
+        this.latestOffsetEstimatorCache;
     final LoadingCache<KafkaSourceDescriptor, Consumer<byte[], byte[]>> pollConsumerCache =
         this.pollConsumerCache;

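Note on the change above: the DoFn no longer opens a dedicated Kafka consumer per topic-partition just to call endOffsets(); instead it reuses the poll consumer's own metadata, combining position() with currentLag() (exposed by Kafka 3.x clients) to refresh a cached AtomicLong after each poll, and hands that value to GrowableOffsetRangeTracker through a method reference. The sketch below is illustrative only, written against plain kafka-clients; the EndOffsetEstimate class name is invented for this example and is not part of Beam.

// Illustrative sketch only: the same position-plus-currentLag pattern as the diff above.
import java.util.OptionalLong;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.kafka.clients.consumer.Consumer;
import org.apache.kafka.common.KafkaException;
import org.apache.kafka.common.TopicPartition;

final class EndOffsetEstimate {
  // Long.MIN_VALUE marks "no estimate yet", matching the cache's initial value in the diff.
  private final AtomicLong estimate = new AtomicLong(Long.MIN_VALUE);

  // Refresh after a poll: currentLag() is only populated once the consumer has fetched from
  // the partition, so a consumer that has not polled yet simply keeps the previous value.
  void update(Consumer<byte[], byte[]> consumer, TopicPartition topicPartition) {
    try {
      long position = consumer.position(topicPartition); // next offset to be consumed
      OptionalLong lag = consumer.currentLag(topicPartition); // lag reported by the client
      lag.ifPresent(l -> estimate.lazySet(position + l)); // end offset ~ position + lag
    } catch (KafkaException e) {
      // Position or lag may be momentarily unavailable; keep the previous estimate.
    }
  }

  long get() {
    return estimate.get();
  }
}

Used this way, the estimate tracks the consumer's own fetch progress instead of issuing separate endOffsets() requests, which is what allows the commit to delete KafkaLatestOffsetEstimator and its dedicated consumer.
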
sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaCommitOffsetTest.java

Lines changed: 2 additions & 2 deletions
@@ -17,11 +17,11 @@
  */
 package org.apache.beam.sdk.io.kafka;

+import java.time.Duration;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.concurrent.TimeUnit;
 import org.apache.beam.sdk.coders.CannotProvideCoderException;
 import org.apache.beam.sdk.coders.KvCoder;
 import org.apache.beam.sdk.coders.StringUtf8Coder;
@@ -236,7 +236,7 @@ public synchronized void commitSync(Map<TopicPartition, OffsetAndMetadata> offse
     }

     @Override
-    public synchronized void close(long timeout, TimeUnit unit) {
+    public synchronized void close(Duration timeout) {
       // Ignore closing since we're using a single consumer.
     }
   }

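The test double changes because the close(long, TimeUnit) overload was deprecated in Kafka 2.0 and removed from the consumer API in 3.x, leaving close(Duration). A minimal sketch of such an override, assuming a MockConsumer-based test double (that assumption is mine; the enclosing test class is not shown in this diff):

// Minimal sketch, assuming a MockConsumer-based test double; not the actual test code.
import java.time.Duration;
import org.apache.kafka.clients.consumer.MockConsumer;
import org.apache.kafka.clients.consumer.OffsetResetStrategy;

class SharedMockConsumer extends MockConsumer<byte[], byte[]> {
  SharedMockConsumer() {
    super(OffsetResetStrategy.EARLIEST);
  }

  @Override
  public synchronized void close(Duration timeout) {
    // Ignore closing since the test shares a single consumer instance.
  }
}
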
settings.gradle.kts

Lines changed: 2 additions & 14 deletions
@@ -352,20 +352,8 @@ project(":beam-test-gha").projectDir = file(".github")
 include("beam-validate-runner")
 project(":beam-validate-runner").projectDir = file(".test-infra/validate-runner")
 include("com.google.api.gax.batching")
-include("sdks:java:io:kafka:kafka-390")
-findProject(":sdks:java:io:kafka:kafka-390")?.name = "kafka-390"
-include("sdks:java:io:kafka:kafka-312")
-findProject(":sdks:java:io:kafka:kafka-312")?.name = "kafka-312"
-include("sdks:java:io:kafka:kafka-282")
-findProject(":sdks:java:io:kafka:kafka-282")?.name = "kafka-282"
-include("sdks:java:io:kafka:kafka-251")
-findProject(":sdks:java:io:kafka:kafka-251")?.name = "kafka-251"
-include("sdks:java:io:kafka:kafka-241")
-findProject(":sdks:java:io:kafka:kafka-241")?.name = "kafka-241"
-include("sdks:java:io:kafka:kafka-231")
-findProject(":sdks:java:io:kafka:kafka-231")?.name = "kafka-231"
-include("sdks:java:io:kafka:kafka-201")
-findProject(":sdks:java:io:kafka:kafka-201")?.name = "kafka-201"
+include("sdks:java:io:kafka:kafka-391")
+findProject(":sdks:java:io:kafka:kafka-391")?.name = "kafka-391"
 include("sdks:java:managed")
 findProject(":sdks:java:managed")?.name = "managed"
 include("sdks:java:io:iceberg")
