-
Notifications
You must be signed in to change notification settings - Fork 25.7k
Add connection failure metrics in RemoteConnectionStrategy #137406
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
345acad
3acd2df
fdd31bd
ababa08
4f6263d
97bc022
96114ac
10cdd3b
af436be
5d6d9fd
d06dfbd
5a9fc0d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,6 +20,8 @@ | |
| import org.elasticsearch.common.util.concurrent.AbstractRunnable; | ||
| import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException; | ||
| import org.elasticsearch.core.Nullable; | ||
| import org.elasticsearch.telemetry.TelemetryProvider; | ||
| import org.elasticsearch.telemetry.metric.LongCounter; | ||
| import org.elasticsearch.threadpool.ThreadPool; | ||
|
|
||
| import java.io.Closeable; | ||
|
|
@@ -79,6 +81,11 @@ public Writeable.Reader<RemoteConnectionInfo.ModeInfo> getReader() { | |
| private List<ActionListener<Void>> listeners = new ArrayList<>(); | ||
| private final AtomicBoolean initialConnectionAttempted = new AtomicBoolean(false); | ||
|
|
||
| static final String INITIAL_CONNECTION_ATTEMPT_FAILURES_COUNTER_NAME = "es.projects.linked.connections.initial.error.total"; | ||
| static final String RECONNECTION_ATTEMPT_FAILURES_COUNTER_NAME = "es.projects.linked.connections.reconnect.error.total"; | ||
| private static LongCounter initialConnectionAttemptFailures; | ||
| private static LongCounter reconnectAttemptFailures; | ||
|
|
||
| protected final TransportService transportService; | ||
| protected final RemoteConnectionManager connectionManager; | ||
| protected final ProjectId originProjectId; | ||
|
|
@@ -92,9 +99,27 @@ public Writeable.Reader<RemoteConnectionInfo.ModeInfo> getReader() { | |
| this.transportService = transportService; | ||
| this.connectionManager = connectionManager; | ||
| this.maxPendingConnectionListeners = config.maxPendingConnectionListeners(); | ||
| registerMetrics(transportService.getTelemetryProvider()); | ||
| connectionManager.addListener(this); | ||
| } | ||
|
|
||
| private static synchronized void registerMetrics(TelemetryProvider telemetryProvider) { | ||
| final var meterRegistry = telemetryProvider == null ? null : telemetryProvider.getMeterRegistry(); | ||
| if (initialConnectionAttemptFailures != null || meterRegistry == null) { | ||
| return; | ||
| } | ||
| initialConnectionAttemptFailures = meterRegistry.registerLongCounter( | ||
| INITIAL_CONNECTION_ATTEMPT_FAILURES_COUNTER_NAME, | ||
| "linked project initial connection attempt failure count", | ||
| "count" | ||
| ); | ||
| reconnectAttemptFailures = meterRegistry.registerLongCounter( | ||
| RECONNECTION_ATTEMPT_FAILURES_COUNTER_NAME, | ||
| "linked project reconnection attempt failure count", | ||
| "count" | ||
| ); | ||
| } | ||
|
|
||
| static ConnectionProfile buildConnectionProfile(LinkedProjectConfig config, String transportProfile) { | ||
| ConnectionProfile.Builder builder = new ConnectionProfile.Builder().setConnectTimeout(config.transportConnectTimeout()) | ||
| .setHandshakeTimeout(config.transportConnectTimeout()) | ||
|
|
@@ -221,7 +246,8 @@ private void connectionAttemptCompleted(@Nullable Exception e) { | |
| logger.debug(msgSupplier); | ||
| } else { | ||
| logger.warn(msgSupplier, e); | ||
| // TODO: ES-12695: Increment either the initial or retry connection failure metric. | ||
| final var counter = isInitialAttempt ? initialConnectionAttemptFailures : reconnectAttemptFailures; | ||
| counter.increment(); | ||
|
||
| } | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,6 +20,8 @@ | |
| import org.elasticsearch.common.util.concurrent.ThreadContext; | ||
| import org.elasticsearch.core.Strings; | ||
| import org.elasticsearch.core.TimeValue; | ||
| import org.elasticsearch.telemetry.InstrumentType; | ||
| import org.elasticsearch.telemetry.RecordingMeterRegistry; | ||
| import org.elasticsearch.test.ESTestCase; | ||
| import org.elasticsearch.test.EnumSerializationTestUtils; | ||
| import org.elasticsearch.test.MockLog; | ||
|
|
@@ -34,6 +36,8 @@ | |
| import static org.elasticsearch.transport.RemoteClusterSettings.SniffConnectionStrategySettings.REMOTE_CLUSTER_SEEDS; | ||
| import static org.elasticsearch.transport.RemoteClusterSettings.toConfig; | ||
| import static org.elasticsearch.transport.RemoteConnectionStrategy.buildConnectionProfile; | ||
| import static org.hamcrest.Matchers.equalTo; | ||
| import static org.hamcrest.Matchers.hasSize; | ||
| import static org.mockito.Mockito.mock; | ||
|
|
||
| public class RemoteConnectionStrategyTests extends ESTestCase { | ||
|
|
@@ -194,7 +198,7 @@ public void testConnectionStrategySerialization() { | |
| value = "org.elasticsearch.transport.RemoteConnectionStrategyTests.FakeConnectionStrategy:DEBUG", | ||
| reason = "logging verification" | ||
| ) | ||
| public void testConnectionAttemptLogging() { | ||
| public void testConnectionAttemptMetricsAndLogging() { | ||
| final var originProjectId = randomUniqueProjectId(); | ||
| final var linkedProjectId = randomUniqueProjectId(); | ||
| final var alias = randomAlphanumericOfLength(10); | ||
|
|
@@ -208,16 +212,21 @@ public void testConnectionAttemptLogging() { | |
| new ClusterConnectionManager(TestProfiles.LIGHT_PROFILE, mock(Transport.class), threadContext) | ||
| ) | ||
| ) { | ||
| assert transportService.getTelemetryProvider() != null; | ||
| final var meterRegistry = transportService.getTelemetryProvider().getMeterRegistry(); | ||
| assert meterRegistry instanceof RecordingMeterRegistry; | ||
| final var metricRecorder = ((RecordingMeterRegistry) meterRegistry).getRecorder(); | ||
|
Comment on lines
+216
to
+219
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Related to my other comments about static fields. If we create two |
||
|
|
||
| for (boolean shouldConnectFail : new boolean[] { true, false }) { | ||
| for (boolean isIntialConnectAttempt : new boolean[] { true, false }) { | ||
| for (boolean isInitialConnectAttempt : new boolean[] { true, false }) { | ||
| final var strategy = new FakeConnectionStrategy( | ||
| originProjectId, | ||
| linkedProjectId, | ||
| alias, | ||
| transportService, | ||
| connectionManager | ||
| ); | ||
| if (isIntialConnectAttempt == false) { | ||
| if (isInitialConnectAttempt == false) { | ||
| waitForConnect(strategy); | ||
| } | ||
| strategy.setShouldConnectFail(shouldConnectFail); | ||
|
|
@@ -228,7 +237,7 @@ public void testConnectionAttemptLogging() { | |
| shouldConnectFail ? "failed to connect" : "successfully connected", | ||
| linkedProjectId, | ||
| alias, | ||
| isIntialConnectAttempt ? "the initial connection" : "a reconnection" | ||
| isInitialConnectAttempt ? "the initial connection" : "a reconnection" | ||
| ); | ||
| assertThatLogger(() -> { | ||
| if (shouldConnectFail) { | ||
|
|
@@ -243,12 +252,21 @@ public void testConnectionAttemptLogging() { | |
| + expectedLogLevel | ||
| + " after a " | ||
| + (shouldConnectFail ? "failed" : "successful") | ||
| + (isIntialConnectAttempt ? " initial connection attempt" : " reconnection attempt"), | ||
| + (isInitialConnectAttempt ? " initial connection attempt" : " reconnection attempt"), | ||
| strategy.getClass().getCanonicalName(), | ||
| expectedLogLevel, | ||
| expectedLogMessage | ||
| ) | ||
| ); | ||
| if (shouldConnectFail) { | ||
| metricRecorder.collect(); | ||
| final var counterName = isInitialConnectAttempt | ||
| ? RemoteConnectionStrategy.INITIAL_CONNECTION_ATTEMPT_FAILURES_COUNTER_NAME | ||
| : RemoteConnectionStrategy.RECONNECTION_ATTEMPT_FAILURES_COUNTER_NAME; | ||
| final var measurements = metricRecorder.getMeasurements(InstrumentType.LONG_COUNTER, counterName); | ||
| assertThat(measurements, hasSize(1)); | ||
| assertThat(measurements.getFirst().getLong(), equalTo(1L)); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The metrics registration should be node-wide, not per Java instance. I suggest we register the metrics in `RemoteClusterService` instead, and you can access them here by either passing them directly or using `meterRegistry.getMeterRegistry(String name)`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I started refactoring to register in `RemoteClusterService`, but realized the result was turning out the same: a single static instance initialized in a static synchronized method. Also, we would be leaking knowledge of the metrics being used down in the strategy class. Aren't the single static instances in `RemoteConnectionStrategy` already node-wide? Perhaps I'm misunderstanding what you mean by 'node wide' and 'per Java instance'. The way it is coded, there are only two static counters, registered once.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You are right that they are registered once as two static fields. But we should not need these static fields and the `synchronized` initialization. It is more idiomatic to register the metrics once at a common place instead of attempting registration by each individual object. For example, we register repository-related metrics once in RepositoriesModule instead of inside each individual repository.

Static fields also do not work well with internal cluster tests, where multiple nodes are running in the same Java process. We want most objects, including MetricRegistry and its registration, to be per ES node. Static fields break this encapsulation because they are shared by multiple ES nodes in the test cluster.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, makes sense now, thank you for clearing this up, Yang; forgive me for it not clicking before. I refactored to register the single metric in `RemoteClusterService`. I needed to refactor the unit tests for `RemoteClusterService` since they were creating an unnecessary `RemoteClusterService` instance, which would cause duplicate metric registration. Since this is a big change and separate from the focus of this PR, I split it off into #137647.