Skip to content

Commit a33006a

Browse files
authored
Add per-database throughput attribution metrics to EventProducer (#1002)
1 parent 44fc9b6 commit a33006a

File tree

3 files changed

+176
-8
lines changed

3 files changed

+176
-8
lines changed

build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ subprojects {
144144
tasks.withType(JavaCompile) {
145145
// Skipping 'deprecation' since pegasus generates problematic files and 'fallthrough' since it can't be suppressed
146146
// Xlint:all - [deprecation, fallthrough]
147-
options.compilerArgs = ["-Xlint:cast,classfile,dep-ann,divzero,empty,finally,options,overrides,path,processing,rawtypes,serial,static,try,unchecked,varargs", "-Werror"]
147+
options.compilerArgs = ["-Xlint:cast,classfile,dep-ann,divzero,empty,finally,options,overrides,path,processing,rawtypes,serial,static,try,unchecked,varargs", "-Xlint:-options", "-Werror"]
148148
}
149149
}
150150

datastream-server/src/main/java/com/linkedin/datastream/server/EventProducer.java

Lines changed: 69 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
*/
66
package com.linkedin.datastream.server;
77

8+
import java.net.URI;
9+
import java.net.URISyntaxException;
810
import java.time.Duration;
911
import java.time.Instant;
1012
import java.util.ArrayList;
@@ -52,6 +54,18 @@ public class EventProducer implements DatastreamEventProducer {
5254
public static final String CONFIG_FLUSH_INTERVAL_MS = "flushIntervalMs";
5355
public static final String CONFIG_ENABLE_PER_TOPIC_METRICS = "enablePerTopicMetrics";
5456
public static final String CONFIG_ENABLE_PER_TOPIC_EVENT_LATENCY_METRICS = "enablePerTopicEventLatencyMetrics";
57+
/**
58+
* When enabled, emits per-source-database throughput attribution metrics keyed as
59+
* {@code EventProducer.db.<databaseName>.bytesProducedRate} and
60+
* {@code EventProducer.db.<databaseName>.eventProduceRate}.
61+
* Applies only to CDC connectors whose source URI uses a single-slash scheme
62+
* (e.g. {@code espresso:/}, {@code mysql:/}, {@code tidb:/}).
63+
* Double-slash URIs (e.g. {@code kafka://}) produce no database metrics.
64+
*
65+
* <p><b>Cardinality warning:</b> each distinct database name creates a new metric series.
66+
* Enable only when the set of source databases is bounded and well-understood.
67+
*/
68+
public static final String CONFIG_ENABLE_THROUGHPUT_METRICS = "enableThroughputAttributionMetrics";
5569

5670
// Default flush interval. It is intentionally kept at low frequency. If a particular connector wants
5771
// a more frequent flush (high traffic connectors), it can perform that on its own.
@@ -79,7 +93,9 @@ public class EventProducer implements DatastreamEventProducer {
7993
private static final String EVENTS_PRODUCED_OUTSIDE_SLA = "eventsProducedOutsideSla";
8094
private static final String EVENTS_PRODUCED_OUTSIDE_ALTERNATE_SLA = "eventsProducedOutsideAlternateSla";
8195
private static final String DROPPED_SENT_FROM_SERIALIZATION_ERROR = "droppedSentFromSerializationError";
96+
static final String BYTES_PRODUCED_RATE = "bytesProducedRate";
8297
private static final String AGGREGATE = "aggregate";
98+
8399
private static final String DEFAULT_AVAILABILITY_THRESHOLD_SLA_MS = "60000"; // 1 minute
84100
private static final String DEFAULT_AVAILABILITY_THRESHOLD_ALTERNATE_SLA_MS = "180000"; // 3 minutes
85101
private static final String DEFAULT_WARN_LOG_LATENCY_ENABLED = "false";
@@ -109,6 +125,9 @@ public class EventProducer implements DatastreamEventProducer {
109125
private final boolean _skipMessageOnSerializationErrors;
110126
private final boolean _enablePerTopicMetrics;
111127
private final boolean _enablePerTopicEventLatencyMetrics;
128+
private final boolean _enableThroughputMetrics;
129+
// Cached source database name parsed from the connection string at construction time (null for non-CDC sources)
130+
private final String _sourceDatabase;
112131
private final Duration _flushInterval;
113132
private final Function<DatastreamTask, Set<String>> _throughputViolatingTopicsProvider;
114133

@@ -188,6 +207,12 @@ public EventProducer(DatastreamTask task, TransportProvider transportProvider, C
188207
Boolean.parseBoolean(config.getProperty(CONFIG_ENABLE_PER_TOPIC_EVENT_LATENCY_METRICS,
189208
Boolean.FALSE.toString()));
190209

210+
_enableThroughputMetrics =
211+
Boolean.parseBoolean(config.getProperty(CONFIG_ENABLE_THROUGHPUT_METRICS, Boolean.FALSE.toString()));
212+
213+
String[] sourceParts = getSourcePathParts();
214+
_sourceDatabase = (sourceParts != null && sourceParts.length > 1) ? sourceParts[1] : null;
215+
191216
_logger.info("Created event producer with customCheckpointing={}", customCheckpointing);
192217

193218
_dynamicMetricsManager = DynamicMetricsManager.getInstance();
@@ -281,10 +306,17 @@ private DatastreamRecordMetadata helperSendOrBroadcast(DatastreamProducerRecord
281306
record.setEventsSendTimestamp(System.currentTimeMillis());
282307
long recordEventsSourceTimestamp = record.getEventsSourceTimestamp();
283308
long recordEventsSendTimestamp = record.getEventsSendTimestamp().orElse(0L);
309+
final long numSerializedBytes = record.getEvents().stream()
310+
.mapToLong(e -> {
311+
long keySize = e.key().filter(k -> k instanceof byte[]).map(k -> (long) ((byte[]) k).length).orElse(0L);
312+
long valSize = e.value().filter(v -> v instanceof byte[]).map(v -> (long) ((byte[]) v).length).orElse(0L);
313+
return keySize + valSize;
314+
})
315+
.sum();
284316
if (isBroadcast) {
285317
broadcastMetadata = _transportProvider.broadcast(destination, record,
286318
(metadata, exception) -> onSendCallback(metadata, exception, sendEventCallback, recordEventsSourceTimestamp,
287-
recordEventsSendTimestamp));
319+
recordEventsSendTimestamp, numSerializedBytes));
288320
_logger.debug("Broadcast completed with {}", broadcastMetadata);
289321
if (broadcastMetadata.isMessageSerializationError()) {
290322
_logger.warn("Broadcast of record {} to destination {} failed because of serialization error.",
@@ -293,7 +325,7 @@ private DatastreamRecordMetadata helperSendOrBroadcast(DatastreamProducerRecord
293325
} else {
294326
_transportProvider.send(destination, record,
295327
(metadata, exception) -> onSendCallback(metadata, exception, sendEventCallback, recordEventsSourceTimestamp,
296-
recordEventsSendTimestamp));
328+
recordEventsSendTimestamp, numSerializedBytes));
297329
}
298330
} catch (Exception e) {
299331
String errorMessage = String.format("Failed to send the event %s exception %s", record, e);
@@ -365,7 +397,8 @@ private void performSlaRelatedLogging(DatastreamRecordMetadata metadata, long ev
365397
* per DatastreamProducerRecord (i.e. by the number of events within the record), only increment all metrics by 1
366398
* to avoid overcounting.
367399
*/
368-
private void reportMetrics(DatastreamRecordMetadata metadata, long eventsSourceTimestamp, long eventsSendTimestamp) {
400+
private void reportMetrics(DatastreamRecordMetadata metadata, long eventsSourceTimestamp, long eventsSendTimestamp,
401+
long numBytes) {
369402
// If per-topic metrics are enabled, use topic as key for metrics; else, use datastream name as the key
370403
String datastreamName = getDatastreamName();
371404

@@ -413,6 +446,7 @@ private void reportMetrics(DatastreamRecordMetadata metadata, long eventsSourceT
413446
}
414447
_dynamicMetricsManager.createOrUpdateMeter(MODULE, AGGREGATE, EVENT_PRODUCE_RATE, 1);
415448
_dynamicMetricsManager.createOrUpdateMeter(MODULE, _datastreamTask.getConnectorType(), EVENT_PRODUCE_RATE, 1);
449+
reportThroughputAttributionMetrics(numBytes);
416450
}
417451

418452
/**
@@ -424,7 +458,7 @@ private void reportMetrics(DatastreamRecordMetadata metadata, long eventsSourceT
424458
* to avoid overcounting.
425459
*/
426460
private void reportMetricsForThroughputViolatingTopics(DatastreamRecordMetadata metadata, long eventsSourceTimestamp,
427-
long eventsSendTimestamp) {
461+
long eventsSendTimestamp, long numBytes) {
428462
String topicOrDatastreamName = _enablePerTopicMetrics ? metadata.getTopic() : getDatastreamName();
429463
// Treat all events within this record equally (assume same timestamp)
430464
if (eventsSourceTimestamp > 0) {
@@ -457,6 +491,7 @@ private void reportMetricsForThroughputViolatingTopics(DatastreamRecordMetadata
457491
}
458492
_dynamicMetricsManager.createOrUpdateMeter(MODULE, AGGREGATE, EVENT_PRODUCE_RATE, 1);
459493
_dynamicMetricsManager.createOrUpdateMeter(MODULE, _datastreamTask.getConnectorType(), EVENT_PRODUCE_RATE, 1);
494+
reportThroughputAttributionMetrics(numBytes);
460495
}
461496

462497
// Report Event Latency metrics for aggregate, connector and topic/datastream
@@ -492,7 +527,7 @@ private void reportSendLatencyMetrics(DatastreamRecordMetadata metadata, long se
492527
}
493528

494529
private void onSendCallback(DatastreamRecordMetadata metadata, Exception exception, SendCallback sendCallback,
495-
long eventSourceTimestamp, long eventSendTimestamp) {
530+
long eventSourceTimestamp, long eventSendTimestamp, long numBytes) {
496531

497532
SendFailedException sendFailedException = null;
498533

@@ -505,9 +540,9 @@ private void onSendCallback(DatastreamRecordMetadata metadata, Exception excepti
505540
// Reporting separate metrics for throughput violating topics.
506541

507542
if (_throughputViolatingTopicsProvider.apply(_datastreamTask).contains(metadata.getUndecoratedTopic())) {
508-
reportMetricsForThroughputViolatingTopics(metadata, eventSourceTimestamp, eventSendTimestamp);
543+
reportMetricsForThroughputViolatingTopics(metadata, eventSourceTimestamp, eventSendTimestamp, numBytes);
509544
} else {
510-
reportMetrics(metadata, eventSourceTimestamp, eventSendTimestamp);
545+
reportMetrics(metadata, eventSourceTimestamp, eventSendTimestamp, numBytes);
511546
}
512547
}
513548
} catch (Exception e) {
@@ -601,10 +636,36 @@ public String toString() {
601636
return String.format("EventProducer producerId=%d", _producerId);
602637
}
603638

639+
private void reportThroughputAttributionMetrics(long numBytes) {
640+
if (!_enableThroughputMetrics) {
641+
return;
642+
}
643+
if (_sourceDatabase != null) {
644+
_dynamicMetricsManager.createOrUpdateMeter(MODULE, "db." + _sourceDatabase, BYTES_PRODUCED_RATE, numBytes);
645+
_dynamicMetricsManager.createOrUpdateMeter(MODULE, "db." + _sourceDatabase, EVENT_PRODUCE_RATE, 1);
646+
}
647+
_dynamicMetricsManager.createOrUpdateMeter(MODULE, AGGREGATE, BYTES_PRODUCED_RATE, numBytes);
648+
_dynamicMetricsManager.createOrUpdateMeter(MODULE, _datastreamTask.getConnectorType(), BYTES_PRODUCED_RATE, numBytes);
649+
}
650+
604651
private String getDatastreamName() {
605652
return _datastreamTask.getDatastreams().get(0).getName();
606653
}
607654

655+
// Returns path segments ["CLUSTER", "DATABASE", "TABLE"] for CDC single-slash URIs, null for BMM double-slash URIs.
656+
// Consistent with MySqlKafkaSource, TiDBKafkaSource, and EspressoSource parsing in brooklin-li-common.
657+
private String[] getSourcePathParts() {
658+
try {
659+
URI uri = new URI(_datastreamTask.getDatastreamSource().getConnectionString());
660+
if (uri.getAuthority() != null) {
661+
return null; // double-slash URI (e.g. kafka://host/topic) — no cluster/database segments
662+
}
663+
return uri.getPath().substring(1).split("/");
664+
} catch (URISyntaxException e) {
665+
return null;
666+
}
667+
}
668+
608669
/**
609670
* Get the list of metrics maintained by the event producer
610671
*/
@@ -615,6 +676,7 @@ public static List<BrooklinMetricInfo> getMetricInfos() {
615676
metrics.add(new BrooklinCounterInfo(METRICS_PREFIX + EVENTS_PRODUCED_WITHIN_ALTERNATE_SLA));
616677
metrics.add(new BrooklinCounterInfo(METRICS_PREFIX + TOTAL_EVENTS_PRODUCED));
617678
metrics.add(new BrooklinMeterInfo(METRICS_PREFIX + EVENT_PRODUCE_RATE));
679+
metrics.add(new BrooklinMeterInfo(METRICS_PREFIX + BYTES_PRODUCED_RATE));
618680
metrics.add(new BrooklinCounterInfo(METRICS_PREFIX + EVENTS_PRODUCED_OUTSIDE_SLA));
619681
metrics.add(new BrooklinCounterInfo(METRICS_PREFIX + EVENTS_PRODUCED_OUTSIDE_ALTERNATE_SLA));
620682
metrics.add(new BrooklinCounterInfo(METRICS_PREFIX + DROPPED_SENT_FROM_SERIALIZATION_ERROR));

datastream-server/src/test/java/com/linkedin/datastream/server/TestEventProducer.java

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import org.testng.annotations.BeforeMethod;
1717
import org.testng.annotations.Test;
1818

19+
import com.codahale.metrics.Meter;
1920
import com.codahale.metrics.MetricRegistry;
2021

2122
import com.linkedin.datastream.common.BrooklinEnvelope;
@@ -155,6 +156,111 @@ public void testPerDatastreamMetrics() {
155156
metrics.getMetric("EventProducer." + datastreamName + "." + EventProducer.EVENTS_SEND_LATENCY_MS_STRING));
156157
}
157158

159+
@Test
160+
public void testThroughputAttributionMetrics() {
161+
String datastreamName = "datastream-testThroughputAttributionMetrics";
162+
Datastream datastream = DatastreamTestUtils.createDatastreams(DummyConnector.CONNECTOR_TYPE, datastreamName)[0];
163+
datastream.getSource().setConnectionString("mysql:/myhost/testDatabase/myTable");
164+
DatastreamTaskImpl task = new DatastreamTaskImpl(Collections.singletonList(datastream));
165+
166+
byte[] key = new byte[10];
167+
byte[] value = new byte[20];
168+
long expectedBytes = key.length + value.length; // 30
169+
170+
String someTopicName = "someTopicName";
171+
TransportProvider transport = new NoOpTransportProviderAdminFactory.NoOpTransportProvider() {
172+
@Override
173+
public void send(String destination, DatastreamProducerRecord record, SendCallback onComplete) {
174+
DatastreamRecordMetadata metadata =
175+
new DatastreamRecordMetadata(record.getCheckpoint(), someTopicName, record.getPartition().orElse(0));
176+
onComplete.onCompletion(metadata, null);
177+
}
178+
};
179+
180+
Properties props = new Properties();
181+
props.put(EventProducer.CONFIG_ENABLE_THROUGHPUT_METRICS, Boolean.TRUE.toString());
182+
EventProducer eventProducer =
183+
new EventProducer(task, transport, new NoOpCheckpointProvider(), props, false);
184+
185+
DatastreamProducerRecordBuilder builder = new DatastreamProducerRecordBuilder();
186+
builder.setPartition(0);
187+
builder.setSourceCheckpoint("0");
188+
builder.setEventsSourceTimestamp(System.currentTimeMillis());
189+
builder.addEvent(new BrooklinEnvelope(key, value, null, new HashMap<>()));
190+
eventProducer.send(builder.build(), (m, e) -> { });
191+
192+
DynamicMetricsManager metrics = DynamicMetricsManager.getInstance();
193+
String connectorType = DummyConnector.CONNECTOR_TYPE;
194+
195+
Meter dbBytesRate = (Meter) metrics.getMetric("EventProducer.db.testDatabase." + EventProducer.BYTES_PRODUCED_RATE);
196+
Assert.assertNotNull(dbBytesRate, "Per-database bytesProducedRate should exist");
197+
Assert.assertEquals(dbBytesRate.getCount(), expectedBytes);
198+
199+
Meter dbEventRate = (Meter) metrics.getMetric("EventProducer.db.testDatabase.eventProduceRate");
200+
Assert.assertNotNull(dbEventRate, "Per-database eventProduceRate should exist");
201+
Assert.assertEquals(dbEventRate.getCount(), 1);
202+
203+
Meter aggBytesRate = (Meter) metrics.getMetric("EventProducer.aggregate." + EventProducer.BYTES_PRODUCED_RATE);
204+
Assert.assertNotNull(aggBytesRate, "Aggregate bytesProducedRate should exist");
205+
Assert.assertEquals(aggBytesRate.getCount(), expectedBytes);
206+
207+
Meter connectorBytesRate = (Meter) metrics.getMetric("EventProducer." + connectorType + "." + EventProducer.BYTES_PRODUCED_RATE);
208+
Assert.assertNotNull(connectorBytesRate, "Connector-type bytesProducedRate should exist");
209+
Assert.assertEquals(connectorBytesRate.getCount(), expectedBytes);
210+
}
211+
212+
@Test
213+
public void testThroughputMetricsDisabledByDefault() {
214+
String datastreamName = "datastream-testThroughputMetricsDisabled";
215+
Datastream datastream = DatastreamTestUtils.createDatastreams(DummyConnector.CONNECTOR_TYPE, datastreamName)[0];
216+
datastream.getSource().setConnectionString("mysql:/myhost/someDatabase/someTable");
217+
DatastreamTaskImpl task = new DatastreamTaskImpl(Collections.singletonList(datastream));
218+
219+
String someTopicName = "someTopicName";
220+
TransportProvider transport = new NoOpTransportProviderAdminFactory.NoOpTransportProvider() {
221+
@Override
222+
public void send(String destination, DatastreamProducerRecord record, SendCallback onComplete) {
223+
DatastreamRecordMetadata metadata =
224+
new DatastreamRecordMetadata(record.getCheckpoint(), someTopicName, record.getPartition().orElse(0));
225+
onComplete.onCompletion(metadata, null);
226+
}
227+
};
228+
229+
// Do NOT set CONFIG_ENABLE_THROUGHPUT_METRICS — it should default to false
230+
EventProducer eventProducer =
231+
new EventProducer(task, transport, new NoOpCheckpointProvider(), new Properties(), false);
232+
233+
eventProducer.send(createDatastreamProducerRecord(), (m, e) -> { });
234+
235+
DynamicMetricsManager metrics = DynamicMetricsManager.getInstance();
236+
Assert.assertNull(metrics.getMetric("EventProducer.db.someDatabase." + EventProducer.BYTES_PRODUCED_RATE),
237+
"bytesProducedRate should not exist when throughput metrics are disabled");
238+
Assert.assertNull(metrics.getMetric("EventProducer.aggregate." + EventProducer.BYTES_PRODUCED_RATE),
239+
"aggregate bytesProducedRate should not exist when throughput metrics are disabled");
240+
}
241+
242+
@Test
243+
public void testNoDatabaseMetricForBmmUri() {
244+
Datastream datastream = DatastreamTestUtils.createDatastreams(DummyConnector.CONNECTOR_TYPE, "datastream-testBmmUri")[0];
245+
// BMM source uses double-slash URI — no database segment should be extracted
246+
datastream.getSource().setConnectionString("kafka://broker:9092/someTopic");
247+
DatastreamTaskImpl task = new DatastreamTaskImpl(Collections.singletonList(datastream));
248+
249+
Properties props = new Properties();
250+
props.put(EventProducer.CONFIG_ENABLE_THROUGHPUT_METRICS, Boolean.TRUE.toString());
251+
EventProducer eventProducer =
252+
new EventProducer(task, new NoOpTransportProviderAdminFactory.NoOpTransportProvider(),
253+
new NoOpCheckpointProvider(), props, false);
254+
255+
eventProducer.send(createDatastreamProducerRecord(), (m, e) -> { });
256+
257+
DynamicMetricsManager metrics = DynamicMetricsManager.getInstance();
258+
Assert.assertNull(metrics.getMetric("EventProducer.db.someTopic." + EventProducer.BYTES_PRODUCED_RATE),
259+
"No per-database metric should exist for BMM double-slash URI");
260+
Assert.assertNotNull(metrics.getMetric("EventProducer.aggregate." + EventProducer.BYTES_PRODUCED_RATE),
261+
"Aggregate bytesProducedRate should still exist for BMM");
262+
}
263+
158264
private DatastreamProducerRecord createDatastreamProducerRecord() {
159265
return createDatastreamProducerRecord(0, "0", 1);
160266
}

0 commit comments

Comments
 (0)