
Commit a14a3f3

Use currentLag and position to set end offset estimate
1 parent b652544 commit a14a3f3

File tree: 8 files changed (+36, -151 lines)

buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy

Lines changed: 2 additions & 2 deletions
@@ -629,7 +629,7 @@ class BeamModulePlugin implements Plugin<Project> {
     def jaxb_api_version = "2.3.3"
     def jsr305_version = "3.0.2"
     def everit_json_version = "1.14.2"
-    def kafka_version = "2.8.2"
+    def kafka_version = "3.9.1"
     def log4j2_version = "2.20.0"
     def nemo_version = "0.1"
     // [bomupgrader] determined by: io.grpc:grpc-netty, consistent with: google_cloud_platform_libraries_bom
@@ -836,7 +836,7 @@ class BeamModulePlugin implements Plugin<Project> {
     jupiter_api          : "org.junit.jupiter:junit-jupiter-api:$jupiter_version",
     jupiter_engine       : "org.junit.jupiter:junit-jupiter-engine:$jupiter_version",
     jupiter_params       : "org.junit.jupiter:junit-jupiter-params:$jupiter_version",
-    kafka                : "org.apache.kafka:kafka_2.11:$kafka_version",
+    kafka                : "org.apache.kafka:kafka_2.12:$kafka_version",
     kafka_clients        : "org.apache.kafka:kafka-clients:$kafka_version",
     log4j                : "log4j:log4j:1.2.17",
     log4j_over_slf4j     : "org.slf4j:log4j-over-slf4j:$slf4j_version",

sdks/java/io/kafka/build.gradle

Lines changed: 1 addition & 3 deletions
@@ -36,9 +36,7 @@ ext {
 }

 def kafkaVersions = [
-  '282': "2.8.2",
-  '312': "3.1.2",
-  '390': "3.9.0",
+  '391': "3.9.1",
 ]

 kafkaVersions.each{k,v -> configurations.create("kafkaVersion$k")}

sdks/java/io/kafka/kafka-312/build.gradle

Lines changed: 0 additions & 24 deletions
This file was deleted.

sdks/java/io/kafka/kafka-390/build.gradle

Lines changed: 0 additions & 24 deletions
This file was deleted.
Lines changed: 2 additions & 2 deletions
@@ -16,8 +16,8 @@
  * limitations under the License.
  */
 project.ext {
-  delimited="2.8.2"
-  undelimited="282"
+  delimited="3.9.1"
+  undelimited="391"
   sdfCompatible=true
 }

sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFn.java

Lines changed: 27 additions & 80 deletions
@@ -19,15 +19,14 @@

 import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState;

-import java.io.Closeable;
 import java.math.BigDecimal;
 import java.math.MathContext;
 import java.time.Duration;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
-import java.util.function.Supplier;
+import java.util.concurrent.atomic.AtomicLong;
 import org.apache.beam.sdk.coders.Coder;
 import org.apache.beam.sdk.io.kafka.KafkaIO.ReadSourceDescriptors;
 import org.apache.beam.sdk.io.kafka.KafkaIOUtils.MovingAvg;
@@ -49,7 +48,6 @@
 import org.apache.beam.sdk.transforms.splittabledofn.WatermarkEstimator;
 import org.apache.beam.sdk.transforms.splittabledofn.WatermarkEstimators.MonotonicallyIncreasing;
 import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.util.ExpiringMemoizingSerializableSupplier;
 import org.apache.beam.sdk.util.MemoizingPerInstantiationSerializableSupplier;
 import org.apache.beam.sdk.util.Preconditions;
 import org.apache.beam.sdk.util.SerializableSupplier;
@@ -71,6 +69,7 @@
 import org.apache.kafka.clients.consumer.ConsumerConfig;
 import org.apache.kafka.clients.consumer.ConsumerRecord;
 import org.apache.kafka.clients.consumer.ConsumerRecords;
+import org.apache.kafka.common.KafkaException;
 import org.apache.kafka.common.PartitionInfo;
 import org.apache.kafka.common.TopicPartition;
 import org.apache.kafka.common.config.ConfigDef;
@@ -235,30 +234,12 @@ public MovingAvg load(KafkaSourceDescriptor kafkaSourceDescriptor)
             CacheBuilder.newBuilder()
                 .concurrencyLevel(Runtime.getRuntime().availableProcessors())
                 .weakValues()
-                .removalListener(
-                    (RemovalNotification<KafkaSourceDescriptor, KafkaLatestOffsetEstimator>
-                            notification) -> {
-                      final @Nullable KafkaLatestOffsetEstimator value;
-                      if (notification.getCause() == RemovalCause.COLLECTED
-                          && (value = notification.getValue()) != null) {
-                        value.close();
-                      }
-                    })
                 .build(
-                    new CacheLoader<KafkaSourceDescriptor, KafkaLatestOffsetEstimator>() {
+                    new CacheLoader<KafkaSourceDescriptor, AtomicLong>() {
                       @Override
-                      public KafkaLatestOffsetEstimator load(
-                          final KafkaSourceDescriptor sourceDescriptor) {
-                        LOG.info(
-                            "Creating Kafka consumer for offset estimation for {}",
-                            sourceDescriptor);
-                        final Map<String, Object> config =
-                            KafkaIOUtils.overrideBootstrapServersConfig(
-                                consumerConfig, sourceDescriptor);
-                        final Consumer<byte[], byte[]> consumer =
-                            consumerFactoryFn.apply(config);
-                        return new KafkaLatestOffsetEstimator(
-                            consumer, sourceDescriptor.getTopicPartition());
+                      public AtomicLong load(final KafkaSourceDescriptor sourceDescriptor) {
+                        LOG.info("Creating end offset estimator for {}", sourceDescriptor);
+                        return new AtomicLong(Long.MIN_VALUE);
                       }
                     }));
     this.pollConsumerCacheSupplier =
@@ -319,8 +300,7 @@ public Consumer<byte[], byte[]> load(
   private final SerializableSupplier<LoadingCache<KafkaSourceDescriptor, MovingAvg>>
       avgRecordSizeCacheSupplier;

-  private final SerializableSupplier<
-          LoadingCache<KafkaSourceDescriptor, KafkaLatestOffsetEstimator>>
+  private final SerializableSupplier<LoadingCache<KafkaSourceDescriptor, AtomicLong>>
       latestOffsetEstimatorCacheSupplier;

   private final SerializableSupplier<LoadingCache<KafkaSourceDescriptor, Consumer<byte[], byte[]>>>
@@ -329,8 +309,7 @@ public Consumer<byte[], byte[]> load(
   private transient @MonotonicNonNull LoadingCache<KafkaSourceDescriptor, MovingAvg>
       avgRecordSizeCache;

-  private transient @MonotonicNonNull LoadingCache<
-          KafkaSourceDescriptor, KafkaLatestOffsetEstimator>
+  private transient @MonotonicNonNull LoadingCache<KafkaSourceDescriptor, AtomicLong>
       latestOffsetEstimatorCache;

   private transient @MonotonicNonNull LoadingCache<KafkaSourceDescriptor, Consumer<byte[], byte[]>>
@@ -349,46 +328,6 @@ public Consumer<byte[], byte[]> load(
   @VisibleForTesting
   static final String RAW_SIZE_METRIC_PREFIX = KafkaUnboundedReader.RAW_SIZE_METRIC_PREFIX;

-  /**
-   * A {@link GrowableOffsetRangeTracker.RangeEndEstimator} which uses a Kafka {@link Consumer} to
-   * fetch backlog.
-   */
-  private static class KafkaLatestOffsetEstimator
-      implements GrowableOffsetRangeTracker.RangeEndEstimator, Closeable {
-    private final Consumer<byte[], byte[]> offsetConsumer;
-    private final Supplier<Long> offsetSupplier;
-
-    KafkaLatestOffsetEstimator(
-        final Consumer<byte[], byte[]> offsetConsumer, final TopicPartition topicPartition) {
-      this.offsetConsumer = offsetConsumer;
-      this.offsetSupplier =
-          new ExpiringMemoizingSerializableSupplier<>(
-              () -> {
-                try {
-                  return offsetConsumer
-                      .endOffsets(Collections.singleton(topicPartition))
-                      .getOrDefault(topicPartition, Long.MIN_VALUE);
-                } catch (Throwable t) {
-                  LOG.error("Failed to get end offset for {}", topicPartition, t);
-                  return Long.MIN_VALUE;
-                }
-              },
-              Duration.ofSeconds(1),
-              Long.MIN_VALUE,
-              Duration.ZERO);
-    }
-
-    @Override
-    public long estimate() {
-      return offsetSupplier.get();
-    }
-
-    @Override
-    public void close() {
-      offsetConsumer.close();
-    }
-  }
-
   @GetInitialRestriction
   @RequiresNonNull({"pollConsumerCache"})
   public OffsetRange initialRestriction(@Element KafkaSourceDescriptor kafkaSourceDescriptor) {
@@ -500,8 +439,8 @@ public double getSize(
   @RequiresNonNull({"latestOffsetEstimatorCache"})
   public UnsplittableRestrictionTracker<OffsetRange, Long> restrictionTracker(
       @Element KafkaSourceDescriptor kafkaSourceDescriptor, @Restriction OffsetRange restriction) {
-    final LoadingCache<KafkaSourceDescriptor, KafkaLatestOffsetEstimator>
-        latestOffsetEstimatorCache = this.latestOffsetEstimatorCache;
+    final LoadingCache<KafkaSourceDescriptor, AtomicLong> latestOffsetEstimatorCache =
+        this.latestOffsetEstimatorCache;

     if (restriction.getTo() < Long.MAX_VALUE) {
       return new UnsplittableRestrictionTracker<>(new OffsetRangeTracker(restriction));
@@ -510,9 +449,10 @@ public UnsplittableRestrictionTracker<OffsetRange, Long> restrictionTracker(
     // OffsetEstimators are cached for each topic-partition because they hold a stateful connection,
     // so we want to minimize the amount of connections that we start and track with Kafka. Another
     // point is that it has a memoized backlog, and this should make that more reusable estimations.
+    final AtomicLong latestOffsetEstimator =
+        latestOffsetEstimatorCache.getUnchecked(kafkaSourceDescriptor);
     return new UnsplittableRestrictionTracker<>(
-        new GrowableOffsetRangeTracker(
-            restriction.getFrom(), latestOffsetEstimatorCache.getUnchecked(kafkaSourceDescriptor)));
+        new GrowableOffsetRangeTracker(restriction.getFrom(), latestOffsetEstimator::get));
   }

   @ProcessElement
@@ -525,14 +465,13 @@ public ProcessContinuation processElement(
       throws Exception {
     final LoadingCache<KafkaSourceDescriptor, MovingAvg> avgRecordSizeCache =
         this.avgRecordSizeCache;
-    final LoadingCache<KafkaSourceDescriptor, KafkaLatestOffsetEstimator>
-        latestOffsetEstimatorCache = this.latestOffsetEstimatorCache;
+    final LoadingCache<KafkaSourceDescriptor, AtomicLong> latestOffsetEstimatorCache =
+        this.latestOffsetEstimatorCache;
     final LoadingCache<KafkaSourceDescriptor, Consumer<byte[], byte[]>> pollConsumerCache =
         this.pollConsumerCache;

     final MovingAvg avgRecordSize = avgRecordSizeCache.get(kafkaSourceDescriptor);
-    final KafkaLatestOffsetEstimator latestOffsetEstimator =
-        latestOffsetEstimatorCache.get(kafkaSourceDescriptor);
+    final AtomicLong latestOffsetEstimator = latestOffsetEstimatorCache.get(kafkaSourceDescriptor);
     final Consumer<byte[], byte[]> consumer = pollConsumerCache.get(kafkaSourceDescriptor);
     final Deserializer<K> keyDeserializerInstance =
         Preconditions.checkStateNotNull(this.keyDeserializerInstance);
@@ -580,6 +519,14 @@ public ProcessContinuation processElement(
         // Fetch the next records.
         final ConsumerRecords<byte[], byte[]> rawRecords = consumer.poll(remainingTimeout);
         final Duration elapsed = pollTimer.elapsed();
+        try {
+          final long position = consumer.position(topicPartition);
+          consumer
+              .currentLag(topicPartition)
+              .ifPresent(lag -> latestOffsetEstimator.lazySet(position + lag));
+        } catch (KafkaException e) {
+        }
+
         try {
           remainingTimeout = remainingTimeout.minus(elapsed);
         } catch (ArithmeticException e) {
@@ -687,7 +634,7 @@ public ProcessContinuation processElement(

       final long estimatedBacklogBytes =
           (long)
-              (BigDecimal.valueOf(latestOffsetEstimator.estimate())
+              (BigDecimal.valueOf(latestOffsetEstimator.get())
                   .subtract(BigDecimal.valueOf(expectedOffset), MathContext.DECIMAL128)
                   .doubleValue()
                   * avgRecordSize.get());
@@ -752,8 +699,8 @@ public void setup() throws Exception {
   public void teardown() throws Exception {
     final LoadingCache<KafkaSourceDescriptor, MovingAvg> avgRecordSizeCache =
         this.avgRecordSizeCache;
-    final LoadingCache<KafkaSourceDescriptor, KafkaLatestOffsetEstimator>
-        latestOffsetEstimatorCache = this.latestOffsetEstimatorCache;
+    final LoadingCache<KafkaSourceDescriptor, AtomicLong> latestOffsetEstimatorCache =
+        this.latestOffsetEstimatorCache;
     final LoadingCache<KafkaSourceDescriptor, Consumer<byte[], byte[]>> pollConsumerCache =
         this.pollConsumerCache;

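The core of the change above, condensed into a standalone sketch (assumed class and method names, not Beam's exact code): after each poll, the same consumer that fetched the records reports its position and its current lag (KIP-695), and position + lag approximates the partition's end offset, so no dedicated metadata consumer is needed for backlog estimation.

import java.util.concurrent.atomic.AtomicLong;
import org.apache.kafka.clients.consumer.Consumer;
import org.apache.kafka.common.KafkaException;
import org.apache.kafka.common.TopicPartition;

/** Sketch only: track an end-offset estimate from the polling consumer itself. */
final class EndOffsetEstimate {
  // Long.MIN_VALUE means "no estimate yet", matching the cache's initial AtomicLong value.
  private final AtomicLong latestEndOffset = new AtomicLong(Long.MIN_VALUE);

  /** Call right after consumer.poll(...): position + currentLag equals the log end offset when lag is known. */
  void update(Consumer<byte[], byte[]> consumer, TopicPartition tp) {
    try {
      long position = consumer.position(tp); // next offset the consumer will fetch
      consumer
          .currentLag(tp) // OptionalLong; empty until the client has lag metadata for tp
          .ifPresent(lag -> latestEndOffset.lazySet(position + lag));
    } catch (KafkaException e) {
      // Keep the previous estimate if position/lag cannot be read right now.
    }
  }

  /** Usable as a range-end estimator via a method reference (this::estimate). */
  long estimate() {
    return latestEndOffset.get();
  }
}

Compared to the removed KafkaLatestOffsetEstimator, this drops one Kafka connection per tracked topic-partition; the trade-off is that the estimate only refreshes when the DoFn itself polls, rather than on its own one-second schedule.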
sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaCommitOffsetTest.java

Lines changed: 2 additions & 2 deletions
@@ -17,11 +17,11 @@
  */
 package org.apache.beam.sdk.io.kafka;

+import java.time.Duration;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.concurrent.TimeUnit;
 import org.apache.beam.sdk.coders.CannotProvideCoderException;
 import org.apache.beam.sdk.coders.KvCoder;
 import org.apache.beam.sdk.coders.StringUtf8Coder;
@@ -236,7 +236,7 @@ public synchronized void commitSync(Map<TopicPartition, OffsetAndMetadata> offse
     }

     @Override
-    public synchronized void close(long timeout, TimeUnit unit) {
+    public synchronized void close(Duration timeout) {
       // Ignore closing since we're using a single consumer.
     }
   }
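The test double changes because Kafka 3.x clients no longer expose the long/TimeUnit close overload; closing takes a Duration instead. A minimal usage sketch, not part of this commit and with an assumed broker address and group id:

import java.time.Duration;
import java.util.Properties;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.ByteArrayDeserializer;

public class ConsumerCloseSketch {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.put("bootstrap.servers", "localhost:9092"); // assumed broker address
    props.put("group.id", "sketch-group");            // assumed group id
    props.put("key.deserializer", ByteArrayDeserializer.class.getName());
    props.put("value.deserializer", ByteArrayDeserializer.class.getName());
    KafkaConsumer<byte[], byte[]> consumer = new KafkaConsumer<>(props);
    // ... poll records ...
    consumer.close(Duration.ofSeconds(5)); // replaces the removed close(5, TimeUnit.SECONDS)
  }
}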

settings.gradle.kts

Lines changed: 2 additions & 14 deletions
@@ -344,20 +344,8 @@ project(":beam-test-gha").projectDir = file(".github")
 include("beam-validate-runner")
 project(":beam-validate-runner").projectDir = file(".test-infra/validate-runner")
 include("com.google.api.gax.batching")
-include("sdks:java:io:kafka:kafka-390")
-findProject(":sdks:java:io:kafka:kafka-390")?.name = "kafka-390"
-include("sdks:java:io:kafka:kafka-312")
-findProject(":sdks:java:io:kafka:kafka-312")?.name = "kafka-312"
-include("sdks:java:io:kafka:kafka-282")
-findProject(":sdks:java:io:kafka:kafka-282")?.name = "kafka-282"
-include("sdks:java:io:kafka:kafka-251")
-findProject(":sdks:java:io:kafka:kafka-251")?.name = "kafka-251"
-include("sdks:java:io:kafka:kafka-241")
-findProject(":sdks:java:io:kafka:kafka-241")?.name = "kafka-241"
-include("sdks:java:io:kafka:kafka-231")
-findProject(":sdks:java:io:kafka:kafka-231")?.name = "kafka-231"
-include("sdks:java:io:kafka:kafka-201")
-findProject(":sdks:java:io:kafka:kafka-201")?.name = "kafka-201"
+include("sdks:java:io:kafka:kafka-391")
+findProject(":sdks:java:io:kafka:kafka-391")?.name = "kafka-391"
 include("sdks:java:managed")
 findProject(":sdks:java:managed")?.name = "managed"
 include("sdks:java:io:iceberg")
