diff --git a/.chloggen/2913.yaml b/.chloggen/2913.yaml new file mode 100644 index 0000000000..a64116cd76 --- /dev/null +++ b/.chloggen/2913.yaml @@ -0,0 +1,8 @@ +change_type: enhancement +component: otel +note: "Add `otel.event.name` attribute for use by non-OTLP exporters and logging libraries." +issues: [2913] +subtext: | + The `otel.event.name` attribute can be used by: + - Non-OTLP exporters to emit the `EventName` + - Applications using existing logging libraries to add event name information that can be used to set the `EventName` field by Collector or SDK components diff --git a/.chloggen/2995-psi.yaml b/.chloggen/2995-psi.yaml new file mode 100644 index 0000000000..99f57a9745 --- /dev/null +++ b/.chloggen/2995-psi.yaml @@ -0,0 +1,11 @@ +change_type: enhancement + +component: system + +note: "Add Linux PSI (Pressure Stall Information) metrics `system.linux.psi.pressure` and `system.linux.psi.total_time` for measuring resource contention." + +issues: [2995] + +subtext: | + PSI metrics track CPU, memory, and I/O resource pressure by measuring the percentage of time tasks are stalled. + These metrics help with workload sizing, detecting productivity losses, and dynamic system management. diff --git a/dependencies.Dockerfile b/dependencies.Dockerfile index eba91fbbb8..8e0472b32a 100644 --- a/dependencies.Dockerfile +++ b/dependencies.Dockerfile @@ -6,7 +6,7 @@ FROM otel/weaver:v0.18.0@sha256:5425ade81dc22ddd840902b0638b4b6a9186fb654c5b50c1d1ccd31299437390 AS weaver # OPA is used to test policies enforced by weaver. -FROM openpolicyagent/opa:1.10.0@sha256:c0814ce7811ecef8f1297a8e55774a1d5422e5c18b996b665acbc126124fab19 AS opa +FROM openpolicyagent/opa:1.10.1@sha256:4715d0574ca2f8c5ecd65b4e5b0833a53aea0f79c31e52a93972b104bbd614fd AS opa # Lychee is used for checking links in documentation. 
FROM lycheeverse/lychee:sha-0a96dc2@sha256:2d397eb32e4add073deb5af328f7d644538cd62c007892c57b57551b073b6a12 AS lychee diff --git a/docs/registry/attributes/otel.md b/docs/registry/attributes/otel.md index 4f8d7e45fd..63ca4b9f28 100644 --- a/docs/registry/attributes/otel.md +++ b/docs/registry/attributes/otel.md @@ -5,6 +5,7 @@ - [OTel Attributes](#otel-attributes) - [OTel Component Attributes](#otel-component-attributes) +- [OTel Event Attributes](#otel-event-attributes) - [OTel Scope Attributes](#otel-scope-attributes) - [Deprecated OTel Library Attributes](#deprecated-otel-library-attributes) @@ -101,6 +102,18 @@ E.g. for Java the fully qualified classname SHOULD be used in this case. | `simple_span_processor` | The builtin SDK simple span processor | ![Development](https://img.shields.io/badge/-development-blue) | | `zipkin_http_span_exporter` | Zipkin span exporter over HTTP | ![Development](https://img.shields.io/badge/-development-blue) | +## OTel Event Attributes + +Attributes used by non-OTLP exporters to represent OpenTelemetry Event's concepts. + +**Attributes:** + +| Key | Stability | Value Type | Description | Example Values | +|---|---|---|---|---| +| `otel.event.name` | ![Development](https://img.shields.io/badge/-development-blue) | string | Identifies the class / type of event. [3] | `browser.mouse.click`; `device.app.lifecycle` | + +**[3] `otel.event.name`:** This attribute SHOULD be used by non-OTLP exporters when destination does not support `EventName` or equivalent field. This attribute MAY be used by applications using existing logging libraries so that it can be used to set the `EventName` field by Collector or SDK components. + ## OTel Scope Attributes Attributes used by non-OTLP exporters to represent OpenTelemetry Scope's concepts. 
diff --git a/docs/registry/attributes/system.md b/docs/registry/attributes/system.md index 403b4adf22..b022c90ef0 100644 --- a/docs/registry/attributes/system.md +++ b/docs/registry/attributes/system.md @@ -7,6 +7,7 @@ - [Filesystem Attributes](#filesystem-attributes) - [System Memory Attributes](#system-memory-attributes) - [System Paging Attributes](#system-paging-attributes) +- [System PSI (Pressure Stall Information) Attributes](#system-psi-pressure-stall-information-attributes) - [Deprecated System Attributes](#deprecated-system-attributes) ## General System Attributes @@ -127,6 +128,45 @@ Describes System Memory Paging attributes | `free` | free | ![Development](https://img.shields.io/badge/-development-blue) | | `used` | used | ![Development](https://img.shields.io/badge/-development-blue) | +## System PSI (Pressure Stall Information) Attributes + +Describes Linux Pressure Stall Information attributes + +**Attributes:** + +| Key | Stability | Value Type | Description | Example Values | +|---|---|---|---|---| +| `system.psi.resource` | ![Development](https://img.shields.io/badge/-development-blue) | string | The resource experiencing pressure [2] | `cpu`; `memory`; `io` | +| `system.psi.stall_type` | ![Development](https://img.shields.io/badge/-development-blue) | string | The PSI stall type | `some`; `full` | +| `system.psi.window` | ![Development](https://img.shields.io/badge/-development-blue) | int | The time window over which pressure is calculated in seconds. [3] | `10`; `60`; `300` | + +**[2] `system.psi.resource`:** Linux PSI (Pressure Stall Information) measures resource pressure for CPU, memory, and I/O. See [Linux kernel PSI documentation](https://docs.kernel.org/accounting/psi.html). + +**[3] `system.psi.window`:** PSI tracks pressure as percentages over 10-second, 60-second, and 300-second windows. This attribute identifies which time window the metric represents. + +--- + +`system.psi.resource` has the following list of well-known values. 
If one of them applies, then the respective value MUST be used; otherwise, a custom value MAY be used. + +| Value | Description | Stability | +|---|---|---| +| `cpu` | CPU resource pressure | ![Development](https://img.shields.io/badge/-development-blue) | +| `io` | I/O resource pressure | ![Development](https://img.shields.io/badge/-development-blue) | +| `memory` | Memory resource pressure | ![Development](https://img.shields.io/badge/-development-blue) | + +--- + +`system.psi.stall_type` has the following list of well-known values. If one of them applies, then the respective value MUST be used; otherwise, a custom value MAY be used. + +| Value | Description | Stability | +|---|---|---| +| `full` | All non-idle tasks are stalled on the resource simultaneously [4] | ![Development](https://img.shields.io/badge/-development-blue) | +| `some` | At least some tasks are stalled on the resource [5] | ![Development](https://img.shields.io/badge/-development-blue) | + +**[4]:** The "full" line indicates the share of time in which all non-idle tasks are stalled on a given resource simultaneously. This represents a state where actual CPU cycles are going to waste and the workload is thrashing. CPU full is undefined at the system level, but has been reported since Linux 5.13, so it is set to zero for backward compatibility. + +**[5]:** The "some" line indicates the share of time in which at least some tasks are stalled on a given resource. + ## Deprecated System Attributes Deprecated system attributes. 
diff --git a/docs/system/system-metrics.md b/docs/system/system-metrics.md index 703d3108b2..250581f7b6 100644 --- a/docs/system/system-metrics.md +++ b/docs/system/system-metrics.md @@ -60,6 +60,9 @@ Resource attributes related to a host, SHOULD be reported under the `host.*` nam - [`system.memory.{os}.` - OS Specific System Memory Metrics](#systemmemoryos---os-specific-system-memory-metrics) - [Metric: `system.memory.linux.available`](#metric-systemmemorylinuxavailable) - [Metric: `system.memory.linux.slab.usage`](#metric-systemmemorylinuxslabusage) +- [Linux PSI (Pressure Stall Information) metrics](#linux-psi-pressure-stall-information-metrics) + - [Metric: `system.linux.psi.pressure`](#metric-systemlinuxpsipressure) + - [Metric: `system.linux.psi.total_time`](#metric-systemlinuxpsitotal_time) @@ -1291,3 +1294,154 @@ See also the [Slab allocator](https://blogs.oracle.com/linux/post/understanding- + +## Linux PSI (Pressure Stall Information) metrics + +**Description:** Linux Pressure Stall Information (PSI) metrics captured under the namespace `system.linux.psi`. + +PSI is a Linux kernel feature (available since kernel 4.20) that identifies and +quantifies resource contention. It measures the time impact that resource +crunches have on workloads by tracking the percentage of time tasks are stalled +waiting for CPU, memory, or I/O resources. + +PSI helps in: + +- Sizing workloads to hardware or provisioning hardware according to workload demand +- Detecting productivity losses caused by resource scarcity +- Dynamic system management (load shedding, job migration, strategic pausing) +- Maximizing hardware utilization without sacrificing workload health + +For more details, see the [Linux kernel PSI documentation](https://docs.kernel.org/accounting/psi.html). + +### Metric: `system.linux.psi.pressure` + +This metric is [recommended][MetricRecommended]. 
+ + + + + + + + +| Name | Instrument Type | Unit (UCUM) | Description | Stability | Entity Associations | +| -------- | --------------- | ----------- | -------------- | --------- | ------ | +| `system.linux.psi.pressure` | Gauge | `1` | Linux Pressure Stall Information (PSI) metric measuring resource contention as a percentage of time. [1] | ![Development](https://img.shields.io/badge/-development-blue) | [`host`](/docs/registry/entities/host.md#host) | + +**[1]:** PSI (Pressure Stall Information) identifies and quantifies resource contention. +The metric represents the percentage of time that tasks were stalled on a given resource +over the specified time window. + +PSI is available on Linux systems with kernel 4.20 or later and requires CONFIG_PSI=y. +CPU "full" stall is undefined at the system level; the kernel has reported it since Linux 5.13 and sets it to zero for backward compatibility. + +The ratios are tracked over 10-second, 60-second and 300-second windows. + +See [Linux kernel PSI documentation](https://docs.kernel.org/accounting/psi.html) + +**Attributes:** + +| Key | Stability | [Requirement Level](https://opentelemetry.io/docs/specs/semconv/general/attribute-requirement-level/) | Value Type | Description | Example Values | +|---|---|---|---|---|---| +| [`system.psi.resource`](/docs/registry/attributes/system.md) | ![Development](https://img.shields.io/badge/-development-blue) | `Required` | string | The resource experiencing pressure [1] | `cpu`; `memory`; `io` | +| [`system.psi.stall_type`](/docs/registry/attributes/system.md) | ![Development](https://img.shields.io/badge/-development-blue) | `Required` | string | The PSI stall type | `some`; `full` | +| [`system.psi.window`](/docs/registry/attributes/system.md) | ![Development](https://img.shields.io/badge/-development-blue) | `Required` | int | The time window over which pressure is calculated in seconds. 
[2] | `10`; `60`; `300` | + +**[1] `system.psi.resource`:** Linux PSI (Pressure Stall Information) measures resource pressure for CPU, memory, and I/O. See [Linux kernel PSI documentation](https://docs.kernel.org/accounting/psi.html). + +**[2] `system.psi.window`:** PSI tracks pressure as percentages over 10-second, 60-second, and 300-second windows. This attribute identifies which time window the metric represents. + +--- + +`system.psi.resource` has the following list of well-known values. If one of them applies, then the respective value MUST be used; otherwise, a custom value MAY be used. + +| Value | Description | Stability | +|---|---|---| +| `cpu` | CPU resource pressure | ![Development](https://img.shields.io/badge/-development-blue) | +| `io` | I/O resource pressure | ![Development](https://img.shields.io/badge/-development-blue) | +| `memory` | Memory resource pressure | ![Development](https://img.shields.io/badge/-development-blue) | + +--- + +`system.psi.stall_type` has the following list of well-known values. If one of them applies, then the respective value MUST be used; otherwise, a custom value MAY be used. + +| Value | Description | Stability | +|---|---|---| +| `full` | All non-idle tasks are stalled on the resource simultaneously [3] | ![Development](https://img.shields.io/badge/-development-blue) | +| `some` | At least some tasks are stalled on the resource [4] | ![Development](https://img.shields.io/badge/-development-blue) | + +**[3]:** The "full" line indicates the share of time in which all non-idle tasks are stalled on a given resource simultaneously. This represents a state where actual CPU cycles are going to waste and the workload is thrashing. CPU full is undefined at the system level, but has been reported since Linux 5.13, so it is set to zero for backward compatibility. + +**[4]:** The "some" line indicates the share of time in which at least some tasks are stalled on a given resource. 
+ + + + + + +### Metric: `system.linux.psi.total_time` + +This metric is [recommended][MetricRecommended]. + + + + + + + + +| Name | Instrument Type | Unit (UCUM) | Description | Stability | Entity Associations | +| -------- | --------------- | ----------- | -------------- | --------- | ------ | +| `system.linux.psi.total_time` | Counter | `s` | Linux Pressure Stall Information (PSI) total cumulative stall time. [1] | ![Development](https://img.shields.io/badge/-development-blue) | [`host`](/docs/registry/entities/host.md#host) | + +**[1]:** This metric tracks the total absolute stall time since system boot. +Unlike the percentage-based `system.linux.psi.pressure` metric, this allows detection +of latency spikes that wouldn't necessarily make a noticeable impact on time averages. +It also enables calculating average trends over custom time frames. + +PSI is available on Linux systems with kernel 4.20 or later and requires CONFIG_PSI=y. +CPU "full" stall is undefined at the system level; the kernel has reported it since Linux 5.13 and sets it to zero for backward compatibility. + +This is a monotonically increasing counter that resets on system reboot. + +Linux exposes this metric in microseconds. Following OpenTelemetry guidelines for measuring durations, +this metric uses seconds. 
+ +See [Linux kernel PSI documentation](https://docs.kernel.org/accounting/psi.html) + +**Attributes:** + +| Key | Stability | [Requirement Level](https://opentelemetry.io/docs/specs/semconv/general/attribute-requirement-level/) | Value Type | Description | Example Values | +|---|---|---|---|---|---| +| [`system.psi.resource`](/docs/registry/attributes/system.md) | ![Development](https://img.shields.io/badge/-development-blue) | `Required` | string | The resource experiencing pressure [1] | `cpu`; `memory`; `io` | +| [`system.psi.stall_type`](/docs/registry/attributes/system.md) | ![Development](https://img.shields.io/badge/-development-blue) | `Required` | string | The PSI stall type | `some`; `full` | + +**[1] `system.psi.resource`:** Linux PSI (Pressure Stall Information) measures resource pressure for CPU, memory, and I/O. See [Linux kernel PSI documentation](https://docs.kernel.org/accounting/psi.html). + +--- + +`system.psi.resource` has the following list of well-known values. If one of them applies, then the respective value MUST be used; otherwise, a custom value MAY be used. + +| Value | Description | Stability | +|---|---|---| +| `cpu` | CPU resource pressure | ![Development](https://img.shields.io/badge/-development-blue) | +| `io` | I/O resource pressure | ![Development](https://img.shields.io/badge/-development-blue) | +| `memory` | Memory resource pressure | ![Development](https://img.shields.io/badge/-development-blue) | + +--- + +`system.psi.stall_type` has the following list of well-known values. If one of them applies, then the respective value MUST be used; otherwise, a custom value MAY be used. 
+ +| Value | Description | Stability | +|---|---|---| +| `full` | All non-idle tasks are stalled on the resource simultaneously [2] | ![Development](https://img.shields.io/badge/-development-blue) | +| `some` | At least some tasks are stalled on the resource [3] | ![Development](https://img.shields.io/badge/-development-blue) | + +**[2]:** The "full" line indicates the share of time in which all non-idle tasks are stalled on a given resource simultaneously. This represents a state where actual CPU cycles are going to waste and the workload is thrashing. CPU full is undefined at the system level, but has been reported since Linux 5.13, so it is set to zero for backward compatibility. + +**[3]:** The "some" line indicates the share of time in which at least some tasks are stalled on a given resource. + + + + + diff --git a/internal/tools/go.mod b/internal/tools/go.mod index 2cec16891f..c0c200f06c 100644 --- a/internal/tools/go.mod +++ b/internal/tools/go.mod @@ -2,7 +2,7 @@ module github.com/open-telemetry/opentelemetry-specification/internal/tools go 1.24.0 -toolchain go1.25.3 +toolchain go1.25.4 require ( github.com/client9/misspell v0.3.4 diff --git a/model/otel/registry.yaml b/model/otel/registry.yaml index 916df907b0..1af13d9e08 100644 --- a/model/otel/registry.yaml +++ b/model/otel/registry.yaml @@ -76,6 +76,23 @@ groups: brief: The schema URL of the instrumentation scope. examples: ['https://opentelemetry.io/schemas/1.31.0'] stability: development + - id: registry.otel.event + type: attribute_group + display_name: OTel Event Attributes + brief: Attributes used by non-OTLP exporters to represent OpenTelemetry Event's concepts. + attributes: + - id: otel.event.name + type: string + stability: development + brief: > + Identifies the class / type of event. + note: > + This attribute SHOULD be used by non-OTLP exporters + when destination does not support `EventName` or equivalent field. 
+ This attribute MAY be used by applications using existing logging + libraries so that it can be used to set the `EventName` field by + Collector or SDK components. + examples: ["browser.mouse.click", "device.app.lifecycle"] - id: registry.otel.component type: attribute_group display_name: OTel Component Attributes diff --git a/model/system/metrics.yaml b/model/system/metrics.yaml index 7a13fdb91a..bfb83725bd 100644 --- a/model/system/metrics.yaml +++ b/model/system/metrics.yaml @@ -563,3 +563,68 @@ groups: - ref: system.memory.linux.slab.state entity_associations: - host + + # system.linux.psi.* metrics + - id: metric.system.linux.psi.pressure + type: metric + metric_name: system.linux.psi.pressure + annotations: + code_generation: + metric_value_type: double + stability: development + brief: "Linux Pressure Stall Information (PSI) metric measuring resource contention as a percentage of time." + note: | + PSI (Pressure Stall Information) identifies and quantifies resource contention. + The metric represents the percentage of time that tasks were stalled on a given resource + over the specified time window. + + PSI is available on Linux systems with kernel 4.20 or later and requires CONFIG_PSI=y. + CPU "full" stall is undefined at the system level; the kernel has reported it since Linux 5.13 and sets it to zero for backward compatibility. + + The ratios are tracked over 10-second, 60-second and 300-second windows. 
+ + See [Linux kernel PSI documentation](https://docs.kernel.org/accounting/psi.html) + instrument: gauge + unit: "1" + attributes: + - ref: system.psi.resource + requirement_level: required + - ref: system.psi.stall_type + requirement_level: required + - ref: system.psi.window + requirement_level: required + entity_associations: + - host + + - id: metric.system.linux.psi.total_time + type: metric + metric_name: system.linux.psi.total_time + annotations: + code_generation: + metric_value_type: double + stability: development + brief: "Linux Pressure Stall Information (PSI) total cumulative stall time." + note: | + This metric tracks the total absolute stall time since system boot. + Unlike the percentage-based `system.linux.psi.pressure` metric, this allows detection + of latency spikes that wouldn't necessarily make a noticeable impact on time averages. + It also enables calculating average trends over custom time frames. + + PSI is available on Linux systems with kernel 4.20 or later and requires CONFIG_PSI=y. + CPU "full" stall is undefined at the system level; the kernel has reported it since Linux 5.13 and sets it to zero for backward compatibility. + + This is a monotonically increasing counter that resets on system reboot. + + Linux exposes this metric in microseconds. Following OpenTelemetry guidelines for measuring durations, + this metric uses seconds. 
+ + See [Linux kernel PSI documentation](https://docs.kernel.org/accounting/psi.html) + instrument: counter + unit: "s" + attributes: + - ref: system.psi.resource + requirement_level: required + - ref: system.psi.stall_type + requirement_level: required + entity_associations: + - host diff --git a/model/system/registry.yaml b/model/system/registry.yaml index fab5dea6c9..ee74c62459 100644 --- a/model/system/registry.yaml +++ b/model/system/registry.yaml @@ -155,3 +155,61 @@ groups: stability: development brief: "The filesystem mount path" examples: ["/mnt/data"] + # system.psi.* attribute group + - id: registry.system.psi + type: attribute_group + display_name: System PSI (Pressure Stall Information) Attributes + brief: "Describes Linux Pressure Stall Information attributes" + attributes: + - id: system.psi.resource + type: + members: + - id: cpu + value: 'cpu' + stability: development + brief: "CPU resource pressure" + - id: memory + value: 'memory' + stability: development + brief: "Memory resource pressure" + - id: io + value: 'io' + stability: development + brief: "I/O resource pressure" + stability: development + brief: "The resource experiencing pressure" + examples: ["cpu", "memory", "io"] + note: > + Linux PSI (Pressure Stall Information) measures resource pressure for CPU, memory, and I/O. + See [Linux kernel PSI documentation](https://docs.kernel.org/accounting/psi.html). + - id: system.psi.stall_type + type: + members: + - id: some + value: 'some' + stability: development + brief: "At least some tasks are stalled on the resource" + note: > + The "some" line indicates the share of time in which at least some + tasks are stalled on a given resource. + - id: full + value: 'full' + stability: development + brief: "All non-idle tasks are stalled on the resource simultaneously" + note: > + The "full" line indicates the share of time in which all non-idle + tasks are stalled on a given resource simultaneously. 
This represents + a state where actual CPU cycles are going to waste and the workload + is thrashing. CPU full is undefined at the system level, but has been + reported since Linux 5.13, so it is set to zero for backward compatibility. + stability: development + brief: "The PSI stall type" + examples: ["some", "full"] + - id: system.psi.window + type: int + stability: development + brief: "The time window over which pressure is calculated in seconds." + examples: [10, 60, 300] + note: > + PSI tracks pressure as percentages over 10-second, 60-second, and 300-second windows. + This attribute identifies which time window the metric represents.