
Commit d43b1d5

Collect kafka metrics only for workers who own the partitions
1 parent b0d293a commit d43b1d5
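
The ownership rule behind this change assigns each Kafka partition to exactly one timely worker by taking the partition id modulo the worker count. A minimal sketch of that rule (the standalone function below is illustrative only; the commit applies the same check inline):

// Hypothetical helper: a partition belongs to the worker whose index equals pid % worker_count.
fn owns_partition(pid: i32, worker_id: i32, worker_count: i32) -> bool {
    (pid % worker_count) == worker_id
}

fn main() {
    // With two workers, worker 0 owns the even partitions and worker 1 the odd ones,
    // matching the has_partition doc-comment example in the diff below.
    assert!(owns_partition(0, 0, 2));
    assert!(!owns_partition(1, 0, 2));
    assert!(owns_partition(1, 1, 2));
}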

File tree

1 file changed (+34, -21 lines)

src/dataflow/src/source/kafka.rs

Lines changed: 34 additions & 21 deletions
@@ -53,7 +53,7 @@ lazy_static! {
 /// Per-Kafka source metrics.
 pub struct SourceMetrics {
     operator_scheduled_counter: IntCounter,
-    capability: UIntGauge,
+    capability: UIntGauge
 }
 
 impl SourceMetrics {
@@ -369,13 +369,17 @@ impl DataPlaneInfo {
         let refresh = self.refresh_metadata_info.clone();
         let id = self.source_id.clone();
         let topic = self.topic_name.clone();
+        let worker_id = self.worker_id;
+        let worker_count = self.worker_count;
         thread::spawn(move || {
             metadata_fetch(
                 timestamping_stopped,
                 consumer,
                 refresh,
                 &id,
                 &topic,
+                worker_id,
+                worker_count,
                 metadata_refresh_frequency,
             )
         });
@@ -421,8 +425,6 @@ impl DataPlaneInfo {
     }
 
     /// Returns true if this worker is responsible for this partition
-    /// If multi-worker reading is not enabled, this worker is *always* responsible for the
-    /// partition
     /// Ex: if pid=0 and worker_id = 0, then true
     /// if pid=1 and worker_id = 0, then false
     fn has_partition(&self, partition_id: i32) -> bool {
@@ -665,6 +667,8 @@ struct ControlPlaneInfo {
     start_offset: MzOffset,
     /// Source Type (Real-time or BYO)
     source_type: Consistency,
+    /// Number of records processed since capability was last downgraded
+    record_count_since_downgrade: u64,
 }
 
 impl ControlPlaneInfo {
@@ -681,6 +685,7 @@ impl ControlPlaneInfo {
             start_offset,
             source_type: consistency,
             time_since_downgrade: Instant::now(),
+            record_count_since_downgrade: 0,
         }
     }
 
@@ -766,6 +771,8 @@ fn metadata_fetch(
     partition_count: Arc<Mutex<Option<i32>>>,
     id: &str,
     topic: &str,
+    worker_id: i32,
+    worker_count: i32,
     wait: Duration,
 ) {
     debug!(
@@ -811,26 +818,29 @@ fn metadata_fetch(
         // Upgrade partition metrics
         for p in 0..new_partition_count {
             let pid = p.try_into().unwrap();
-            match consumer.fetch_watermarks(&topic, pid, Duration::from_secs(1)) {
-                Ok((_, high)) => {
-                    if let Some(max_available_offset) =
-                        partition_kafka_metadata.get_mut(&pid)
-                    {
-                        max_available_offset.set(high)
-                    } else {
-                        let max_offset = MAX_AVAILABLE_OFFSET.with_label_values(&[
-                            topic,
-                            &id,
-                            &pid.to_string(),
-                        ]);
-                        max_offset.set(high);
-                        partition_kafka_metadata.insert(pid, max_offset);
+            // Only check metadata updates for partitions that the worker owns
+            if (pid % worker_count) == worker_id {
+                match consumer.fetch_watermarks(&topic, pid, Duration::from_secs(1)) {
+                    Ok((_, high)) => {
+                        if let Some(max_available_offset) =
+                            partition_kafka_metadata.get_mut(&pid)
+                        {
+                            max_available_offset.set(high)
+                        } else {
+                            let max_offset = MAX_AVAILABLE_OFFSET.with_label_values(&[
+                                topic,
+                                &id,
+                                &pid.to_string(),
+                            ]);
+                            max_offset.set(high);
+                            partition_kafka_metadata.insert(pid, max_offset);
+                        }
                     }
+                    Err(e) => warn!(
+                        "error loading watermarks topic={} partition={} error={}",
+                        topic, p, e
+                    ),
                 }
-                Err(e) => warn!(
-                    "error loading watermarks topic={} partition={} error={}",
-                    topic, p, e
-                ),
             }
         }
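
The create-or-reuse logic around MAX_AVAILABLE_OFFSET above is the usual prometheus-crate pattern of caching one gauge per label set, now touched only for partitions this worker owns. A minimal sketch of that pattern, assuming a prometheus IntGaugeVec; the metric name, help text, and helper function below are illustrative, not taken from this commit:

// Sketch only: cache one IntGauge per partition id, creating it from the labelled
// vec on first use, as the get_mut/insert pair in the hunk above does.
use std::collections::HashMap;

use lazy_static::lazy_static;
use prometheus::{register_int_gauge_vec, IntGauge, IntGaugeVec};

lazy_static! {
    static ref MAX_AVAILABLE_OFFSET_SKETCH: IntGaugeVec = register_int_gauge_vec!(
        "kafka_partition_max_available_offset_sketch",
        "Illustrative high watermark per (topic, source, partition)",
        &["topic", "source_id", "partition_id"]
    )
    .unwrap();
}

fn record_high_watermark(
    cache: &mut HashMap<i32, IntGauge>,
    topic: &str,
    source_id: &str,
    pid: i32,
    high: i64,
) {
    // Reuse the cached gauge for this partition, or create it on first sight.
    cache
        .entry(pid)
        .or_insert_with(|| {
            MAX_AVAILABLE_OFFSET_SKETCH.with_label_values(&[topic, source_id, &pid.to_string()])
        })
        .set(high);
}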

@@ -1029,6 +1039,7 @@ where
                 &mut dp_info.partition_metrics.get_mut(&partition).unwrap();
             partition_metrics.offset_ingested.set(offset.offset);
             partition_metrics.messages_ingested.inc();
+            cp_info.record_count_since_downgrade+=1;
         }
     }
 
@@ -1260,6 +1271,7 @@ fn downgrade_capability(
         if changed && min > 0 {
             dp_info.source_metrics.capability.set(min);
             cap.downgrade(&(&min + 1));
+            cp_info.record_count_since_downgrade = 0;
             cp_info.last_closed_ts = min;
         }
     } else {
@@ -1274,6 +1286,7 @@ fn downgrade_capability(
                 cap.downgrade(&(&ts + 1));
             }
             cp_info.time_since_downgrade = Instant::now();
+            cp_info.record_count_since_downgrade = 0;
         }
     }
 }
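
The new record_count_since_downgrade field follows a simple count-then-reset pattern: it is incremented once per ingested record and cleared at both places where the capability is downgraded. A minimal sketch of that bookkeeping; the struct and method names below are illustrative, not part of the commit:

// Sketch only: the real state lives in ControlPlaneInfo and the resets happen
// inside downgrade_capability(), as the hunks above show.
struct DowngradeBookkeeping {
    record_count_since_downgrade: u64,
    last_closed_ts: u64,
}

impl DowngradeBookkeeping {
    /// Called once per ingested record.
    fn on_record(&mut self) {
        self.record_count_since_downgrade += 1;
    }

    /// Called whenever the capability is downgraded to close timestamps up to `min`.
    fn on_downgrade(&mut self, min: u64) {
        self.record_count_since_downgrade = 0;
        self.last_closed_ts = min;
    }
}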
