Skip to content
Closed
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -137,14 +137,24 @@ class APMJvmOptions {
*/
static List<String> apmJvmOptions(Settings settings, @Nullable SecureSettings secrets, Path logsDir, Path tmpdir) throws UserException,
IOException {
final Path agentJar = findAgentJar();
boolean tracingEnabled = settings.getAsBoolean("telemetry.tracing.enabled", false);
boolean metricsEnabled = settings.getAsBoolean("telemetry.metrics.enabled", false);
boolean agentMetricsEnabled = settings.getAsBoolean("telemetry.otel.metrics.enabled", false) == false;
boolean attachAgent = tracingEnabled || (metricsEnabled && agentMetricsEnabled);

if (agentJar == null) {
final Path agentJar = findAgentJar(System.getProperty("user.dir"));

if (attachAgent == false || agentJar == null) {
return List.of();
}

final Map<String, String> propertiesMap = extractApmSettings(settings);

if (metricsEnabled == false || agentMetricsEnabled == false) {
propertiesMap.put("metrics_interval", "0s");
propertiesMap.put("disable_metrics", "*");
}

// Configures a log file to write to. Don't disable writing to a log file,
// as the agent will then require extra Security Manager permissions when
// it tries to do something else, and it's just painful.
Expand Down
72 changes: 72 additions & 0 deletions gradle/verification-metadata.xml
Original file line number Diff line number Diff line change
Expand Up @@ -907,6 +907,11 @@
<sha256 value="8540247fad9e06baefa8fb45eb313802d019f485f14300e0f9d6b556ed88e753" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="com.google.protobuf" name="protobuf-java" version="4.28.3">
<artifact name="protobuf-java-4.28.3.jar">
<sha256 value="ba02977c0fef8b40af9f85fe69af362d8e13f2685b49a9752750b18da726157e" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="com.google.protobuf" name="protobuf-java" version="4.32.0">
<artifact name="protobuf-java-4.32.0.jar">
<sha256 value="e902c91b454812b7d056b8d303a572733bf0587576ff157c1049116c9626241d" origin="Generated by Gradle"/>
Expand Down Expand Up @@ -1794,6 +1799,16 @@
<sha256 value="1991903b9fc76b27f1e6a70a5a97131668fb5f3ac9026178c450d510cbb1bef2" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="io.opentelemetry" name="opentelemetry-api" version="1.57.0">
<artifact name="opentelemetry-api-1.57.0.jar">
<sha256 value="88bbfc3fcc81af3299e7f69cddcc9672e410ffcf7435e8b50a474cf2e454b410" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="io.opentelemetry" name="opentelemetry-api-incubator" version="1.57.0-alpha">
<artifact name="opentelemetry-api-incubator-1.57.0-alpha.jar">
<sha256 value="1e2fe498522d00b674d42b9db8819f46d455bd541937fbd44a5a920b8bb6e91e" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="io.opentelemetry" name="opentelemetry-common" version="1.53.0">
<artifact name="opentelemetry-common-1.53.0.jar">
<sha256 value="73a5181dd07e72c4312fadafc8328ad1046a32c050030e3a7b8c16113daad359" origin="Generated by Gradle"/>
Expand Down Expand Up @@ -1824,6 +1839,11 @@
<sha256 value="d7f093f987547c9c2c2caa28baf1254507c59b4dbf97b49fee5c9cd1ceb6f8d5" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="io.opentelemetry" name="opentelemetry-exporter-logging" version="1.53.0">
<artifact name="opentelemetry-exporter-logging-1.53.0.jar">
<sha256 value="acd7b018a6076b1365b3520b6fc1fcf489f3da2f3e1a90ca052cab4dee6c8247" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="io.opentelemetry" name="opentelemetry-exporter-otlp" version="1.53.0">
<artifact name="opentelemetry-exporter-otlp-1.53.0.jar">
<sha256 value="0ce88bf35577894ee185f718f1b981938bbfc8981f3b7fcecbfd7b7c375bc236" origin="Generated by Gradle"/>
Expand Down Expand Up @@ -1864,16 +1884,63 @@
<sha256 value="e814e63dc2f8cbdf84574b6289eb543b89b06486d03cad1e697b06d802ce27bb" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="io.opentelemetry" name="opentelemetry-sdk-logs" version="1.53.0">
<artifact name="opentelemetry-sdk-logs-1.53.0.jar">
<sha256 value="ae1c30170bfe71cb91e2fc0d193c8899aaba5acc90c80a219a4080c2817b9bf8" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="io.opentelemetry" name="opentelemetry-sdk-metrics" version="1.53.0">
<artifact name="opentelemetry-sdk-metrics-1.53.0.jar">
<sha256 value="3d7dbae6c03be035e7c4dbdd50442741b260be5463fa9aadca1b00af0efddee5" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="io.opentelemetry" name="opentelemetry-sdk-testing" version="1.53.0">
<artifact name="opentelemetry-sdk-testing-1.53.0.jar">
<sha256 value="26b115af66c7b2ea089bde66fc733647a2cae024e1ecf6a84a870033c7964214" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="io.opentelemetry" name="opentelemetry-sdk-trace" version="1.53.0">
<artifact name="opentelemetry-sdk-trace-1.53.0.jar">
<sha256 value="6d26512cbf7434c03a25f3828886f8866f43c683339ba98eaccce74471fcb40a" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="io.opentelemetry.proto" name="opentelemetry-proto" version="1.5.0-alpha">
<artifact name="opentelemetry-proto-1.5.0-alpha.jar">
<sha256 value="03bab813f054fd931f9ccc9bf63e199b8ba33c91f661787f99a916a50a26eb32" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="io.opentelemetry" name="opentelemetry-semconv" version="1.21.0-alpha">
<artifact name="opentelemetry-semconv-1.21.0-alpha.jar">
<sha256 value="4a8f41b93eec51e85fa6b48e43de6785b742316fdd9c9baf595adbce6d5de6af" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="io.opentelemetry.instrumentation" name="opentelemetry-instrumentation-api" version="2.23.0">
<artifact name="opentelemetry-instrumentation-api-2.23.0.jar">
<sha256 value="c6b86acc7ccb2a6a0979ae73ae686ea4ac8a24e656912bc169216bcfd87f44b1" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="io.opentelemetry.instrumentation" name="opentelemetry-instrumentation-api-incubator" version="2.23.0-alpha">
<artifact name="opentelemetry-instrumentation-api-incubator-2.23.0-alpha.jar">
<sha256 value="4f068fad9ea65cb1f2213ca50ba154878e7e12c87291f2f2182137e26ab25c31" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="io.opentelemetry.instrumentation" name="opentelemetry-runtime-telemetry-java17" version="2.23.0-alpha">
<artifact name="opentelemetry-runtime-telemetry-java17-2.23.0-alpha.jar">
<sha256 value="c6eed0860e7a46b39173566eff70d4b85d954ba62aeaf9affda281a1c2fd102d" origin="Generated by Gradle">
<also-trust value="4489144d20e9ea67405e340f399b840db6a89dcbf1ecd85ee7a8beef81629667"/>
</sha256>
</artifact>
</component>
<component group="io.opentelemetry.instrumentation" name="opentelemetry-runtime-telemetry-java8" version="2.23.0-alpha">
<artifact name="opentelemetry-runtime-telemetry-java8-2.23.0-alpha.jar">
<sha256 value="71df1c056b9dca5c41077c70d4028a0f4b45c2b089509727fcfd8672afb5504e" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="io.opentelemetry.semconv" name="opentelemetry-semconv" version="1.37.0">
<artifact name="opentelemetry-semconv-1.37.0.jar">
<sha256 value="693ad6f04f29b4b593a04adef5f575d28b3a91ea3449ab5b1e1e2e5c6efc6cdc" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="io.ous" name="jtoml" version="2.0.0">
<artifact name="jtoml-2.0.0.jar">
<sha256 value="3cabdae2244c999addebb8c31ae452fbdc874b4f26a163539954b8eeb5d6acc6" origin="Generated by Gradle"/>
Expand Down Expand Up @@ -3200,6 +3267,11 @@
<sha256 value="b4a1796fab7bfc36df015c1b4052459147997e8d215a7199d71d05f9e747e4f4" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="org.apache.logging.log4j" name="log4j-jul" version="2.19.0">
<artifact name="log4j-jul-2.19.0.jar">
<sha256 value="c3f0cbd1e455b1f3443c1bf0860fa3a91f5ae721d1acfeff393629fdefc23b6b" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="org.apache.logging.log4j" name="log4j-core" version="2.20.0">
<artifact name="log4j-core-2.20.0.jar">
<sha256 value="6137df848cdaed9f4d5076f75513c6c85da80b953f4e7acca38098b770763f55" origin="Generated by Gradle"/>
Expand Down
56 changes: 52 additions & 4 deletions modules/apm/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,67 @@ esplugin {
classname ='org.elasticsearch.telemetry.apm.APM'
}

def otelVersion = '1.31.0'
def otelSemconvVersion = '1.21.0-alpha'
def otelVersion = '1.53.0'
def otelInstrumentationVersion = '2.23.0-alpha'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like a major bump. Should we do it in a separate change, apart from this PR, to make sure it doesn't have any side effects? Also, why are we using "alpha" versions?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is one of the first things I touched. Thanks for highlighting it; it needs more work!

For the version bumps, there are a couple of reasons:

  1. Between 1.31.0 and 1.53.0 (but now 1.59.0) there are quite a few quality-of-life improvements around the OTLP exporter, but most importantly 1.52.0 contains this fix for flushing metrics, which I think is important for our use case.
  2. For the instrumentation version, I want to use the release that targets the specific SDK version picked, which would be 2.19.0 as recommended here

I am happy to separate this into its own PR, to be merged before all of this, if that's better!


dependencies {
implementation "io.opentelemetry:opentelemetry-api:${otelVersion}"
implementation "io.opentelemetry:opentelemetry-context:${otelVersion}"
implementation "io.opentelemetry:opentelemetry-semconv:${otelSemconvVersion}"
runtimeOnly "co.elastic.apm:elastic-apm-agent-java8:1.55.0"
implementation "io.opentelemetry:opentelemetry-sdk:${otelVersion}"
implementation "io.opentelemetry:opentelemetry-sdk-metrics:${otelVersion}"
implementation("io.opentelemetry:opentelemetry-exporter-otlp:${otelVersion}") {
exclude group: 'io.opentelemetry', module: 'opentelemetry-exporter-sender-okhttp'
}
implementation "io.opentelemetry.instrumentation:opentelemetry-runtime-telemetry-java17:${otelInstrumentationVersion}"
implementation "io.opentelemetry.instrumentation:opentelemetry-runtime-telemetry-java8:${otelInstrumentationVersion}"

implementation "io.opentelemetry:opentelemetry-sdk-common:${otelVersion}"
runtimeOnly "io.opentelemetry:opentelemetry-common:${otelVersion}"
runtimeOnly "io.opentelemetry:opentelemetry-sdk-trace:${otelVersion}"
runtimeOnly "io.opentelemetry:opentelemetry-sdk-logs:${otelVersion}"
runtimeOnly "io.opentelemetry:opentelemetry-exporter-common:${otelVersion}"
runtimeOnly "io.opentelemetry:opentelemetry-exporter-otlp-common:${otelVersion}"
runtimeOnly "io.opentelemetry:opentelemetry-exporter-sender-jdk:${otelVersion}"
runtimeOnly "io.opentelemetry:opentelemetry-sdk-extension-autoconfigure-spi:${otelVersion}"
runtimeOnly "co.elastic.apm:elastic-apm-agent-java8:1.55.0"

testImplementation "io.opentelemetry:opentelemetry-sdk-testing:${otelVersion}"

javaRestTestImplementation project(':modules:apm')
javaRestTestImplementation project(':test:framework')
}

tasks.named("thirdPartyAudit").configure {
ignoreMissingClasses(
'com.fasterxml.jackson.core.JsonFactory',
'com.fasterxml.jackson.core.JsonGenerator',
'com.google.common.io.ByteStreams',
'com.google.common.util.concurrent.ListenableFuture',
'io.grpc.CallOptions',
'io.grpc.Channel',
'io.grpc.Drainable',
'io.grpc.KnownLength',
'io.grpc.ManagedChannel',
'io.grpc.MethodDescriptor',
'io.grpc.MethodDescriptor$Builder',
'io.grpc.MethodDescriptor$Marshaller',
'io.grpc.MethodDescriptor$MethodType',
'io.grpc.stub.AbstractFutureStub',
'io.grpc.stub.AbstractStub',
'io.grpc.stub.ClientCalls'
)
ignoreViolations(
// uses internal java api: sun.misc.Unsafe
'io.opentelemetry.internal.shaded.jctools.util.UnsafeRefArrayAccess',
'io.opentelemetry.internal.shaded.jctools.util.UnsafeAccess',
'io.opentelemetry.exporter.internal.marshal.UnsafeAccess',
'io.opentelemetry.exporter.internal.marshal.UnsafeAccess$UnsafeHolder',
'io.opentelemetry.internal.shaded.jctools.queues.MpscArrayQueueProducerLimitField',
'io.opentelemetry.internal.shaded.jctools.queues.MpscArrayQueueProducerIndexField',
'io.opentelemetry.internal.shaded.jctools.queues.MpscArrayQueueConsumerIndexField'
)
}

tasks.named("dependencyLicenses").configure {
mapping from: /opentelemetry-.*/, to: 'opentelemetry'
}
9 changes: 8 additions & 1 deletion modules/apm/src/main/java/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,15 @@
requires org.elasticsearch.xcontent;
requires org.apache.logging.log4j;
requires org.apache.lucene.core;
requires io.opentelemetry.context;
requires java.management;
requires jdk.management;
requires io.opentelemetry.api;
requires io.opentelemetry.context;
requires io.opentelemetry.sdk;
requires io.opentelemetry.sdk.metrics;
requires io.opentelemetry.exporter.otlp;
requires io.opentelemetry.instrumentation.runtime_telemetry_java17;
requires io.opentelemetry.sdk.common;

exports org.elasticsearch.telemetry.apm;
exports org.elasticsearch.telemetry.apm.metrics;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,9 @@ public List<Setting<?>> getSettings() {
APMAgentSettings.TELEMETRY_TRACING_ENABLED_SETTING,
APMAgentSettings.TELEMETRY_TRACING_NAMES_INCLUDE_SETTING,
APMAgentSettings.TELEMETRY_TRACING_NAMES_EXCLUDE_SETTING,
APMAgentSettings.TELEMETRY_TRACING_SANITIZE_FIELD_NAMES
APMAgentSettings.TELEMETRY_TRACING_SANITIZE_FIELD_NAMES,
// OTEL SDK
APMAgentSettings.TELEMETRY_OTEL_METRICS_ENABLED_SETTING
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,13 @@ private static Setting<String> concreteAgentSetting(String namespace, String qua
NodeScope
);

public static final Setting<Boolean> TELEMETRY_OTEL_METRICS_ENABLED_SETTING = Setting.boolSetting(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we intend this setting to remain after we have migrated to the OTel SDK? If not, I think this would be better as a system property; otherwise, proper deprecation/removal would be needed.

TELEMETRY_SETTING_PREFIX + "otel.metrics.enabled",
false,
OperatorDynamic,
NodeScope
);

public static final Setting<SecureString> TELEMETRY_SECRET_TOKEN_SETTING = SecureSetting.secureString(
TELEMETRY_SETTING_PREFIX + "secret_token",
null
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,13 @@

public class APMMeterService extends AbstractLifecycleComponent {
private final APMMeterRegistry meterRegistry;

private final Supplier<Meter> otelMeterSupplier;
private final Supplier<Meter> noopMeterSupplier;

protected volatile boolean enabled;

public APMMeterService(Settings settings) {
this(settings, APMMeterService.otelMeter(), APMMeterService.noopMeter());
this(settings, createOtelMeterSupplier(settings), () -> OpenTelemetry.noop().getMeter("noop"));
}

public APMMeterService(Settings settings, Supplier<Meter> otelMeterSupplier, Supplier<Meter> noopMeterSupplier) {
Expand All @@ -40,7 +39,17 @@ public APMMeterService(boolean enabled, Supplier<Meter> otelMeterSupplier, Suppl
this.enabled = enabled;
this.otelMeterSupplier = otelMeterSupplier;
this.noopMeterSupplier = noopMeterSupplier;
this.meterRegistry = new APMMeterRegistry(enabled ? createOtelMeter() : createNoopMeter());
this.meterRegistry = new APMMeterRegistry(enabled ? otelMeterSupplier.get() : noopMeterSupplier.get());
if (enabled && otelMeterSupplier instanceof OtelSdkMeterSupplier) {
SystemMetrics.register(meterRegistry);
}
}

/**
 * Picks the source of {@link Meter} instances for this service.
 *
 * @param settings node settings consulted for {@code APMAgentSettings.TELEMETRY_OTEL_METRICS_ENABLED_SETTING}
 * @return an {@code OtelSdkMeterSupplier} built from {@code settings} when the setting is {@code true};
 *         otherwise a supplier that looks up the "elasticsearch" meter on {@code GlobalOpenTelemetry}
 *         each time it is invoked
 */
private static Supplier<Meter> createOtelMeterSupplier(Settings settings) {
    final boolean useOtelSdk = APMAgentSettings.TELEMETRY_OTEL_METRICS_ENABLED_SETTING.get(settings);
    if (useOtelSdk) {
        return new OtelSdkMeterSupplier(settings);
    }
    // The global instance is resolved lazily, on every get(), not when the supplier is created.
    return () -> GlobalOpenTelemetry.get().getMeter("elasticsearch");
}

public APMMeterRegistry getMeterRegistry() {
Expand All @@ -52,41 +61,20 @@ public APMMeterRegistry getMeterRegistry() {
*/
void setEnabled(boolean enabled) {
this.enabled = enabled;
if (enabled) {
meterRegistry.setProvider(createOtelMeter());
} else {
meterRegistry.setProvider(createNoopMeter());
}
meterRegistry.setProvider(enabled ? otelMeterSupplier.get() : noopMeterSupplier.get());
}

@Override
protected void doStart() {}

@Override
protected void doStop() {
meterRegistry.setProvider(createNoopMeter());
if (otelMeterSupplier instanceof OtelSdkMeterSupplier otelSdk) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: this would be less clunky if it checked for Closeable.

otelSdk.close();
}
meterRegistry.setProvider(noopMeterSupplier.get());
}

@Override
protected void doClose() {}

protected Meter createOtelMeter() {
assert this.enabled;
return otelMeterSupplier.get();
}

protected Meter createNoopMeter() {
return noopMeterSupplier.get();
}

protected static Supplier<Meter> noopMeter() {
return () -> OpenTelemetry.noop().getMeter("noop");
}

// to be used within doPrivileged block
private static Supplier<Meter> otelMeter() {
var openTelemetry = GlobalOpenTelemetry.get();
var meter = openTelemetry.getMeter("elasticsearch");
return () -> meter;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,13 @@ public class MetricValidator {
"es.thread_pool.searchable_snapshots_cache_fetch_async.*",
"es.thread_pool.searchable_snapshots_cache_prewarming.*",
"es.thread_pool.security-crypto.*",
"es.thread_pool.security-token-key.*"
"es.thread_pool.security-token-key.*",
// APM Java agent-compatible metric names (see https://www.elastic.co/docs/reference/apm/agents/java/metrics#metrics-jvm)
"system.memory.*",
"system.process.*",
"jvm.fd.*",
"jvm.gc.*",
"jvm.memory.*"
);

/**
Expand Down Expand Up @@ -242,7 +248,10 @@ private static class Attributes {
Map.entry("es.tsdb.downsample.actions.shard.total", DOWNSAMPLE_ATTRIBUTES),
Map.entry("es.tsdb.downsample.actions.total", DOWNSAMPLE_ATTRIBUTES),
Map.entry("es.tsdb.downsample.latency.shard.histogram", DOWNSAMPLE_ATTRIBUTES),
Map.entry("es.tsdb.downsample.latency.total.histogram", DOWNSAMPLE_ATTRIBUTES)
Map.entry("es.tsdb.downsample.latency.total.histogram", DOWNSAMPLE_ATTRIBUTES),
// APM Java agent-compatible metrics (see https://www.elastic.co/docs/reference/apm/agents/java/metrics#metrics-jvm)
Map.entry("jvm.gc.count", Set.of("name")),
Map.entry("jvm.gc.time", Set.of("name"))
);

// forbidden attributes known to cause issues due to mapping conflicts or high cardinality
Expand Down Expand Up @@ -318,6 +327,11 @@ public static void assertValidAttributeNames(String metricName, Map<String, Obje

assert Attributes.OTEL_ATTRIBUTES.contains(attribute)
|| Attributes.SKIP_VALIDATION.getOrDefault(metricName, emptySet()).contains(attribute)
// allow percentile for all thread pools
// https://github.com/elastic/dev/issues/3436 remove the usage of percentile as attribute and move to metric name.
|| (metricName.startsWith("es.thread_pool.") && attribute.equals("percentile"))
// ML metrics use dot-separated attribute key
|| (metricName.startsWith("es.ml.") && attribute.equals("es.ml.is_master"))
|| Attributes.ATTRIBUTE_PATTERN.matcher(attribute).matches()
: Strings.format(
"Attribute [%s] of [%s] does not match the required naming pattern [%s], see the naming guidelines.",
Expand Down
Loading