diff --git a/go.mod b/go.mod index 0a2dc05b9a..85726766b5 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,8 @@ replace collectd.org v0.4.0 => github.com/collectd/go-collectd v0.4.0 // to be all replaced since there are some changes that will always be from upstream replace ( github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter => github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter v0.0.0-20251014190537-ae1729ad22c4 - github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsemfexporter => github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20251014190537-ae1729ad22c4 + //TODO replace with official repo after dependent PR merged + github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsemfexporter => github.com/yanhaoluo666/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20251020112017-94065798a087 github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsxrayexporter => github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsxrayexporter v0.0.0-20251014190537-ae1729ad22c4 ) @@ -47,7 +48,8 @@ replace ( ) replace ( - github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20251014190537-ae1729ad22c4 + //TODO replace with official repo after dependent PR merged + github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver => github.com/yanhaoluo666/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20251013162607-385c069b9f1f github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightskueuereceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightskueuereceiver v0.0.0-20251014190537-ae1729ad22c4 
github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awsxrayreceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awsxrayreceiver v0.0.0-20251014190537-ae1729ad22c4 github.com/open-telemetry/opentelemetry-collector-contrib/receiver/jmxreceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/jmxreceiver v0.0.0-20251014190537-ae1729ad22c4 @@ -145,6 +147,7 @@ require ( github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatocumulativeprocessor v0.124.0 github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor v0.124.1 github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor v0.124.1 + github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor v0.124.1 github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbytraceprocessor v0.124.1 github.com/open-telemetry/opentelemetry-collector-contrib/processor/k8sattributesprocessor v0.124.1 github.com/open-telemetry/opentelemetry-collector-contrib/processor/metricsgenerationprocessor v0.124.1 diff --git a/go.sum b/go.sum index 19c70558cd..58d14034c4 100644 --- a/go.sum +++ b/go.sum @@ -187,8 +187,6 @@ github.com/aliyun/alibaba-cloud-sdk-go v1.61.1483 h1:J8HaD+Zpfi1gcel3HCKpoHHEsrc github.com/aliyun/alibaba-cloud-sdk-go v1.61.1483/go.mod h1:RcDobYh8k5VP6TNybz9m++gL3ijVI5wueVr0EM10VsU= github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter v0.0.0-20251014190537-ae1729ad22c4 h1:0gvkXM8HrRo9fu+34OYLVngN87FtstrYjhIjAcOyfTA= github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:8dL1mhunGsDXn59xUlnNn1ydT5wp6Fh5KTvlBEaN2Po= -github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20251014190537-ae1729ad22c4 h1:XLDJlsz7glQ0PcWYZ9S664H2Hyy/xaRHIT5gGlQFtEk= 
-github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:dSu6d3FZqrAECatXDhvYsQIEAaL1iF+fokrPwCjxhC8= github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsxrayexporter v0.0.0-20251014190537-ae1729ad22c4 h1:jn9YrkY2ZLbpT4n6q/EkfSwGH0cx/diKBmlQcYvLTJ8= github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsxrayexporter v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:b9TxHHL62ladWlbU6klYIvDjCN3Ee31oWrKlF50fQns= github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsmiddleware v0.0.0-20251014190537-ae1729ad22c4 h1:YQkkr7FxnF52KvG4CIM8a0A/7m6CIQMmtFHqfd8D/PA= @@ -229,8 +227,6 @@ github.com/amazon-contributing/opentelemetry-collector-contrib/processor/cumulat github.com/amazon-contributing/opentelemetry-collector-contrib/processor/cumulativetodeltaprocessor v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:Wvs2QPuB4ngUiOjrJpYWLqfU8X0Z27s33uMKP4YHQmE= github.com/amazon-contributing/opentelemetry-collector-contrib/processor/resourcedetectionprocessor v0.0.0-20251014190537-ae1729ad22c4 h1:AbFzE4JNvrwGoyDCgmg3WiTaA1bxPc6xzJCwgHEtzNw= github.com/amazon-contributing/opentelemetry-collector-contrib/processor/resourcedetectionprocessor v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:idk0SX/ZWccyRfPyAKPu1uVvd+KBMT0pE75HHFogitY= -github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20251014190537-ae1729ad22c4 h1:kMAB6h54Q/OJDQEP+XCxjcGnyOHq/KuyGiUliGsUFJI= -github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:2/Yuy1ePzxKLoHdJIS/BBdWfMD+wpkudvxe1HXZuMAM= github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightskueuereceiver v0.0.0-20251014190537-ae1729ad22c4 h1:2YfnGwsnhAqUNrOsye+w2xI9rya3KT7RucYFwDDAs30= 
github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightskueuereceiver v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:431fc3JgruV7R3yhAzu7w0fdPaBp1Tbn4RV+8R4Dtdw= github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awsxrayreceiver v0.0.0-20251014190537-ae1729ad22c4 h1:81WCB7g7vOUGs5HSNpuHrzMCXXH+XyOJHju/NkygKms= @@ -1278,6 +1274,8 @@ github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatoratep github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor v0.124.1/go.mod h1:bp3Y5GT4dkGWRGEZqKgfanyk6ZSsVGNY5aNDvX4c8WE= github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor v0.124.1 h1:qkqiqLwfg7hj+oDYvpmMD64p+poaxXwo654ZE44uPm4= github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor v0.124.1/go.mod h1:B/GP3l4Y1qNsNtWVIzpwS8jWB1Nn/vx0sFBlVDkWt9E= +github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor v0.124.1 h1:gYlUUIR+lzLQCpj5phh+Ogmk5BRaOrEuKGjIixCk89I= +github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor v0.124.1/go.mod h1:bkAXtBtShDOA8SuF8IpYbhx1BYWUEE1rW10HXXEXW/4= github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbytraceprocessor v0.124.1 h1:6MAKxLXfQWHEadn9AgY1jWdKFTJkLYVBa+/h3Rk23lE= github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbytraceprocessor v0.124.1/go.mod h1:6q2oIAtCuX9HklnqGPO8sWPoTAjhZX1x23O0aTR/zd0= github.com/open-telemetry/opentelemetry-collector-contrib/processor/k8sattributesprocessor v0.124.1 h1:esSFJIhlZaZslW9EYY/Ss5zUnfkuN2qiS+7ujk73/gU= @@ -1590,6 +1588,10 @@ github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 h1:EzJWgHo github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ= github.com/xeipuuv/gojsonschema v1.2.0 h1:LhYJRs+L4fBtjZUfuSZIKGeVu0QRy8e5Xi7D17UxZ74= 
github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y= +github.com/yanhaoluo666/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20251020112017-94065798a087 h1:i6sKcLt+bHgbJs3Hjaea5C/g2HeY8gLnZtS+Gxl+IQw= +github.com/yanhaoluo666/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20251020112017-94065798a087/go.mod h1:dSu6d3FZqrAECatXDhvYsQIEAaL1iF+fokrPwCjxhC8= +github.com/yanhaoluo666/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20251013162607-385c069b9f1f h1:heA619m3+WxatwdpSaBqCpnlA719p2nvjmSpJDhDy44= +github.com/yanhaoluo666/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20251013162607-385c069b9f1f/go.mod h1:2/Yuy1ePzxKLoHdJIS/BBdWfMD+wpkudvxe1HXZuMAM= github.com/youmark/pkcs8 v0.0.0-20201027041543-1326539a0a0a h1:fZHgsYlfvtyqToslyjUt3VOPF4J7aK/3MPcK7xp3PDk= github.com/youmark/pkcs8 v0.0.0-20201027041543-1326539a0a0a/go.mod h1:ul22v+Nro/R083muKhosV54bj5niojjWZvU8xrevuH4= github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= diff --git a/service/defaultcomponents/components.go b/service/defaultcomponents/components.go index 6602971a7f..acee94c653 100644 --- a/service/defaultcomponents/components.go +++ b/service/defaultcomponents/components.go @@ -19,6 +19,7 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatocumulativeprocessor" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor" + "github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbytraceprocessor" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/k8sattributesprocessor" 
"github.com/open-telemetry/opentelemetry-collector-contrib/processor/metricsgenerationprocessor" @@ -107,6 +108,7 @@ func Factories() (otelcol.Factories, error) { gpuattributes.NewFactory(), kueueattributes.NewFactory(), groupbytraceprocessor.NewFactory(), + groupbyattrsprocessor.NewFactory(), k8sattributesprocessor.NewFactory(), memorylimiterprocessor.NewFactory(), metricsgenerationprocessor.NewFactory(), diff --git a/service/defaultcomponents/components_test.go b/service/defaultcomponents/components_test.go index 98e789ebd4..c77d7e43da 100644 --- a/service/defaultcomponents/components_test.go +++ b/service/defaultcomponents/components_test.go @@ -54,6 +54,7 @@ func TestComponents(t *testing.T) { "gpuattributes", "kueueattributes", "groupbytrace", + "groupbyattrs", "k8sattributes", "memory_limiter", "metricstransform", diff --git a/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.yaml b/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.yaml index e879d58968..6cfea39dde 100644 --- a/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.yaml +++ b/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.yaml @@ -1476,6 +1476,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: false add_full_pod_name_metric_label: false add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.yaml b/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.yaml index 5d7236220e..039ea34a49 100644 --- a/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.yaml +++ b/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.yaml @@ -1477,6 +1477,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: false 
add_full_pod_name_metric_label: false add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/appsignals_fallback_and_eks_config.yaml b/translator/tocwconfig/sampleConfig/appsignals_fallback_and_eks_config.yaml index e879d58968..6cfea39dde 100644 --- a/translator/tocwconfig/sampleConfig/appsignals_fallback_and_eks_config.yaml +++ b/translator/tocwconfig/sampleConfig/appsignals_fallback_and_eks_config.yaml @@ -1476,6 +1476,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: false add_full_pod_name_metric_label: false add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/appsignals_over_fallback_config.yaml b/translator/tocwconfig/sampleConfig/appsignals_over_fallback_config.yaml index e879d58968..6cfea39dde 100644 --- a/translator/tocwconfig/sampleConfig/appsignals_over_fallback_config.yaml +++ b/translator/tocwconfig/sampleConfig/appsignals_over_fallback_config.yaml @@ -1476,6 +1476,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: false add_full_pod_name_metric_label: false add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/base_container_insights_config.yaml b/translator/tocwconfig/sampleConfig/base_container_insights_config.yaml index 3074a5d230..9a7fbc2b0f 100644 --- a/translator/tocwconfig/sampleConfig/base_container_insights_config.yaml +++ b/translator/tocwconfig/sampleConfig/base_container_insights_config.yaml @@ -196,6 +196,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: false add_full_pod_name_metric_label: false add_service_as_attribute: true diff --git 
a/translator/tocwconfig/sampleConfig/container_insights_jmx.yaml b/translator/tocwconfig/sampleConfig/container_insights_jmx.yaml index abc6f62001..a044866ec0 100644 --- a/translator/tocwconfig/sampleConfig/container_insights_jmx.yaml +++ b/translator/tocwconfig/sampleConfig/container_insights_jmx.yaml @@ -509,6 +509,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: true + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: false add_full_pod_name_metric_label: false add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml index 6f63f5cebe..620bc80d7d 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml @@ -650,6 +650,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: true add_full_pod_name_metric_label: true add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml index 2777967e14..6bce1ffd50 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml @@ -1409,6 +1409,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: true + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: true add_full_pod_name_metric_label: true add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_kueue_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_kueue_config.yaml index 
9ac6191348..b4ba7af0bc 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_kueue_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_kueue_config.yaml @@ -752,6 +752,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: true add_full_pod_name_metric_label: true add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml index 986ecc1f08..5521dd14ae 100644 --- a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml +++ b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml @@ -607,6 +607,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: true add_full_pod_name_metric_label: true add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/kueue_container_insights_config.yaml b/translator/tocwconfig/sampleConfig/kueue_container_insights_config.yaml index 6db5f681ce..d2a2107e29 100644 --- a/translator/tocwconfig/sampleConfig/kueue_container_insights_config.yaml +++ b/translator/tocwconfig/sampleConfig/kueue_container_insights_config.yaml @@ -296,6 +296,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: false add_full_pod_name_metric_label: false add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/log_ecs_metric_only.yaml b/translator/tocwconfig/sampleConfig/log_ecs_metric_only.yaml index 50248a3159..79377d6781 100644 --- a/translator/tocwconfig/sampleConfig/log_ecs_metric_only.yaml +++ b/translator/tocwconfig/sampleConfig/log_ecs_metric_only.yaml 
@@ -139,6 +139,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: true + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: false add_full_pod_name_metric_label: false add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml index 9fed73f560..cd9c3688be 100644 --- a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml +++ b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml @@ -644,6 +644,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: true add_full_pod_name_metric_label: true add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/prometheus_and_kubernetes_config.yaml b/translator/tocwconfig/sampleConfig/prometheus_and_kubernetes_config.yaml index aac57b548a..89cc44cb6f 100644 --- a/translator/tocwconfig/sampleConfig/prometheus_and_kubernetes_config.yaml +++ b/translator/tocwconfig/sampleConfig/prometheus_and_kubernetes_config.yaml @@ -649,6 +649,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: true add_full_pod_name_metric_label: true add_service_as_attribute: true diff --git a/translator/translate/otel/common/common.go b/translator/translate/otel/common/common.go index 3b2eba8dda..50af9792b8 100644 --- a/translator/translate/otel/common/common.go +++ b/translator/translate/otel/common/common.go @@ -21,67 +21,69 @@ import ( ) const ( - AgentKey = "agent" - DebugKey = "debug" - MetricsKey = "metrics" - LogsKey = "logs" - TracesKey = "traces" - MetricsCollectedKey = "metrics_collected" - LogsCollectedKey = "logs_collected" - TracesCollectedKey = 
"traces_collected" - MetricsDestinationsKey = "metrics_destinations" - ECSKey = "ecs" - KubernetesKey = "kubernetes" - CloudWatchKey = "cloudwatch" - CloudWatchLogsKey = "cloudwatchlogs" - PrometheusKey = "prometheus" - PrometheusConfigPathKey = "prometheus_config_path" - AMPKey = "amp" - WorkspaceIDKey = "workspace_id" - EMFProcessorKey = "emf_processor" - DisableMetricExtraction = "disable_metric_extraction" - XrayKey = "xray" - OtlpKey = "otlp" - JmxKey = "jmx" - TLSKey = "tls" - Endpoint = "endpoint" - EndpointOverrideKey = "endpoint_override" - RegionOverrideKey = "region_override" - ProxyOverrideKey = "proxy_override" - InsecureKey = "insecure" - LocalModeKey = "local_mode" - CredentialsKey = "credentials" - RoleARNKey = "role_arn" - SigV4Auth = "sigv4auth" - MetricsCollectionIntervalKey = "metrics_collection_interval" - AggregationDimensionsKey = "aggregation_dimensions" - MeasurementKey = "measurement" - DropOriginalMetricsKey = "drop_original_metrics" - ForceFlushIntervalKey = "force_flush_interval" - ContainerInsightsMetricGranularity = "metric_granularity" // replaced with enhanced_container_insights - EnhancedContainerInsights = "enhanced_container_insights" - ResourcesKey = "resources" - PreferFullPodName = "prefer_full_pod_name" - EnableAcceleratedComputeMetric = "accelerated_compute_metrics" - EnableKueueContainerInsights = "kueue_container_insights" - AppendDimensionsKey = "append_dimensions" - Console = "console" - DiskKey = "disk" - DiskIOKey = "diskio" - NetKey = "net" - Emf = "emf" - StructuredLog = "structuredlog" - ServiceAddress = "service_address" - Udp = "udp" - Tcp = "tcp" - TlsKey = "tls" - Tags = "tags" - Region = "region" - LogGroupName = "log_group_name" - LogStreamName = "log_stream_name" - NameKey = "name" - RenameKey = "rename" - UnitKey = "unit" + AgentKey = "agent" + DebugKey = "debug" + MetricsKey = "metrics" + LogsKey = "logs" + TracesKey = "traces" + MetricsCollectedKey = "metrics_collected" + LogsCollectedKey = 
"logs_collected" + TracesCollectedKey = "traces_collected" + MetricsDestinationsKey = "metrics_destinations" + ECSKey = "ecs" + KubernetesKey = "kubernetes" + CloudWatchKey = "cloudwatch" + CloudWatchLogsKey = "cloudwatchlogs" + PrometheusKey = "prometheus" + PrometheusConfigPathKey = "prometheus_config_path" + AMPKey = "amp" + WorkspaceIDKey = "workspace_id" + EMFProcessorKey = "emf_processor" + DisableMetricExtraction = "disable_metric_extraction" + XrayKey = "xray" + OtlpKey = "otlp" + JmxKey = "jmx" + TLSKey = "tls" + Endpoint = "endpoint" + EndpointOverrideKey = "endpoint_override" + RegionOverrideKey = "region_override" + ProxyOverrideKey = "proxy_override" + InsecureKey = "insecure" + LocalModeKey = "local_mode" + CredentialsKey = "credentials" + RoleARNKey = "role_arn" + SigV4Auth = "sigv4auth" + MetricsCollectionIntervalKey = "metrics_collection_interval" + AggregationDimensionsKey = "aggregation_dimensions" + MeasurementKey = "measurement" + DropOriginalMetricsKey = "drop_original_metrics" + ForceFlushIntervalKey = "force_flush_interval" + ContainerInsightsMetricGranularity = "metric_granularity" // replaced with enhanced_container_insights + EnhancedContainerInsights = "enhanced_container_insights" + ResourcesKey = "resources" + PreferFullPodName = "prefer_full_pod_name" + EnableAcceleratedComputeMetric = "accelerated_compute_metrics" + AcceleratedComputeGPUMetricsCollectionInterval = "accelerated_compute_gpu_metrics_collection_interval" + HighFrequencyGpuMetrics = "high_frequency_gpu_metrics" + EnableKueueContainerInsights = "kueue_container_insights" + AppendDimensionsKey = "append_dimensions" + Console = "console" + DiskKey = "disk" + DiskIOKey = "diskio" + NetKey = "net" + Emf = "emf" + StructuredLog = "structuredlog" + ServiceAddress = "service_address" + UDP = "udp" + TCP = "tcp" + TlsKey = "tls" //nolint:revive + Tags = "tags" + Region = "region" + LogGroupName = "log_group_name" + LogStreamName = "log_stream_name" + NameKey = "name" + RenameKey = 
"rename" + UnitKey = "unit" ) const ( diff --git a/translator/translate/otel/common/common_test.go b/translator/translate/otel/common/common_test.go index 24b940b13a..32e397e2f0 100644 --- a/translator/translate/otel/common/common_test.go +++ b/translator/translate/otel/common/common_test.go @@ -31,6 +31,8 @@ func (t testTranslator) ID() component.ID { func TestConfigKeys(t *testing.T) { require.Equal(t, "1::2", ConfigKey("1", "2")) + require.Equal(t, "logs::metrics_collected::kubernetes::accelerated_compute_gpu_metrics_collection_interval", + ConfigKey(LogsKey, MetricsCollectedKey, KubernetesKey, AcceleratedComputeGPUMetricsCollectionInterval)) } func TestGetString(t *testing.T) { diff --git a/translator/translate/otel/exporter/awsemf/kubernetes.go b/translator/translate/otel/exporter/awsemf/kubernetes.go index d74597bd6f..f4e7ff561a 100644 --- a/translator/translate/otel/exporter/awsemf/kubernetes.go +++ b/translator/translate/otel/exporter/awsemf/kubernetes.go @@ -7,6 +7,7 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsemfexporter" "go.opentelemetry.io/collector/confmap" + "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/receiver/awscontainerinsight" ) @@ -60,7 +61,7 @@ func setKubernetesMetricDeclaration(conf *confmap.Conf, cfg *awsemfexporter.Conf cfg.MetricDeclarations = kubernetesMetricDeclarations cfg.MetricDescriptors = getControlPlaneMetricDescriptors(conf) - + cfg.GaugeMetricsToCompact = getGaugeMetricsToCompact(conf) return nil } @@ -722,3 +723,33 @@ func getVolumesMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDe } return metricDeclarations } + +func getGaugeMetricsToCompact(conf *confmap.Conf) []string { + var metricsToHistogram []string + if awscontainerinsight.IsHighFrequencyGPUMetricsEnabled(conf) { + var gpuMetricTypes = []string{ + containerinsightscommon.TypeGpuContainer, + 
containerinsightscommon.TypeGpuPod, + containerinsightscommon.TypeGpuNode, + } + + // GPU metrics to be compacted to values and counts + gpuMetrics := []string{ + containerinsightscommon.GpuUtilization, + containerinsightscommon.GpuMemUtilization, + containerinsightscommon.GpuMemTotal, + containerinsightscommon.GpuMemUsed, + containerinsightscommon.GpuPowerDraw, + containerinsightscommon.GpuTemperature, + containerinsightscommon.GpuTensorCoreUtilization, + } + + // Generate metric names by looping through types and metrics + for _, t := range gpuMetricTypes { + for _, m := range gpuMetrics { + metricsToHistogram = append(metricsToHistogram, containerinsightscommon.MetricName(t, m)) + } + } + } + return metricsToHistogram +} diff --git a/translator/translate/otel/pipeline/containerinsights/translator.go b/translator/translate/otel/pipeline/containerinsights/translator.go index 91a6fd6b10..8fbabb10dc 100644 --- a/translator/translate/otel/pipeline/containerinsights/translator.go +++ b/translator/translate/otel/pipeline/containerinsights/translator.go @@ -5,6 +5,7 @@ package containerinsights import ( "fmt" + "time" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/confmap" @@ -17,6 +18,7 @@ import ( "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/batchprocessor" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/filterprocessor" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/gpu" + "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/groupbyattrsprocessor" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/kueue" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/metricstransformprocessor" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/receiver/awscontainerinsight" @@ -58,13 +60,29 @@ func (t *translator) Translate(conf *confmap.Conf) (*common.ComponentTranslators return 
nil, &common.MissingKeyError{ID: t.ID(), JsonKey: fmt.Sprint(ecsKey, " or ", eksKey)} } + highFrequencyGPUMetricsEnabled := t.pipelineName == ciPipelineName && awscontainerinsight.IsHighFrequencyGPUMetricsEnabled(conf) + // create processor map with - // - default batch processor + // - batch processor (with timeout override if high-frequency GPU metrics are enabled) // - filter processor to drop prometheus metadata + var batchTranslator common.ComponentTranslator + if highFrequencyGPUMetricsEnabled { + // Use 1 minute timeout directly for high-frequency GPU metrics + batchTranslator = batchprocessor.NewTranslatorWithNameSectionAndTimeout(t.pipelineName, common.LogsKey, time.Minute) + } else { + // Use default timeout based on telemetry section + batchTranslator = batchprocessor.NewTranslatorWithNameAndSection(t.pipelineName, common.LogsKey) + } + processors := common.NewTranslatorMap( - batchprocessor.NewTranslatorWithNameAndSection(t.pipelineName, common.LogsKey), + batchTranslator, filterprocessor.NewTranslator(common.WithName(t.pipelineName)), ) + + if highFrequencyGPUMetricsEnabled { + processors.Set(groupbyattrsprocessor.NewTranslatorWithName(t.pipelineName)) + } + // create exporter map with default emf exporter based on pipeline name exporters := common.NewTranslatorMap(awsemf.NewTranslatorWithName(t.pipelineName)) // create extensions map based on pipeline name diff --git a/translator/translate/otel/pipeline/containerinsights/translators_test.go b/translator/translate/otel/pipeline/containerinsights/translators_test.go index 78720fd415..e615fb0305 100644 --- a/translator/translate/otel/pipeline/containerinsights/translators_test.go +++ b/translator/translate/otel/pipeline/containerinsights/translators_test.go @@ -18,8 +18,9 @@ import ( func TestTranslators(t *testing.T) { type want struct { - receivers []string - exporters []string + receivers []string + processors []string + exporters []string } testCases := map[string]struct { input map[string]any @@ 
-37,8 +38,9 @@ func TestTranslators(t *testing.T) { }, want: map[string]want{ "metrics/containerinsights": { - receivers: []string{"awscontainerinsightreceiver"}, - exporters: []string{"awsemf/containerinsights"}, + receivers: []string{"awscontainerinsightreceiver"}, + processors: []string{"batch/containerinsights", "filter/containerinsights", "awsentity/resource/containerinsights"}, + exporters: []string{"awsemf/containerinsights"}, }, }, }, @@ -55,8 +57,9 @@ func TestTranslators(t *testing.T) { }, want: map[string]want{ "metrics/containerinsights": { - receivers: []string{"awscontainerinsightreceiver"}, - exporters: []string{"awsemf/containerinsights"}, + receivers: []string{"awscontainerinsightreceiver"}, + processors: []string{"batch/containerinsights", "filter/containerinsights", "awsentity/resource/containerinsights", "metricstransform/containerinsights", "gpuattributes/containerinsights"}, + exporters: []string{"awsemf/containerinsights"}, }, }, }, @@ -73,12 +76,35 @@ func TestTranslators(t *testing.T) { }, want: map[string]want{ "metrics/containerinsights": { - receivers: []string{"awscontainerinsightreceiver"}, - exporters: []string{"awsemf/containerinsights"}, + receivers: []string{"awscontainerinsightreceiver"}, + processors: []string{"batch/containerinsights", "filter/containerinsights", "awsentity/resource/containerinsights"}, + exporters: []string{"awsemf/containerinsights"}, }, "metrics/kueueContainerInsights": { - receivers: []string{"awscontainerinsightskueuereceiver"}, - exporters: []string{"awsemf/kueueContainerInsights"}, + receivers: []string{"awscontainerinsightskueuereceiver"}, + processors: []string{"batch/kueueContainerInsights", "filter/kueueContainerInsights", "kueueattributes/kueueContainerInsights"}, + exporters: []string{"awsemf/kueueContainerInsights"}, + }, + }, + }, + "WithEnhancedContainerInsightsAndHighFrequencyGPUMetrics": { + input: map[string]interface{}{ + "logs": map[string]interface{}{ + "metrics_collected": 
map[string]interface{}{ + "kubernetes": map[string]interface{}{ + "enhanced_container_insights": true, + "accelerated_compute_metrics": true, + "accelerated_compute_gpu_metrics_collection_interval": 30, // 30 seconds, less than default 60s + "cluster_name": "TestCluster", + }, + }, + }, + }, + want: map[string]want{ + "metrics/containerinsights": { + receivers: []string{"awscontainerinsightreceiver"}, + processors: []string{"batch/containerinsights", "filter/containerinsights", "groupbyattrs/containerinsights", "awsentity/resource/containerinsights", "metricstransform/containerinsights", "gpuattributes/containerinsights"}, + exporters: []string{"awsemf/containerinsights"}, }, }, }, @@ -98,6 +124,7 @@ func TestTranslators(t *testing.T) { g, err := tr.Translate(conf) assert.NoError(t, err) assert.Equal(t, w.receivers, collections.MapSlice(g.Receivers.Keys(), component.ID.String)) + assert.Equal(t, w.processors, collections.MapSlice(g.Processors.Keys(), component.ID.String)) assert.Equal(t, w.exporters, collections.MapSlice(g.Exporters.Keys(), component.ID.String)) }) } diff --git a/translator/translate/otel/pipeline/emf_logs/translator.go b/translator/translate/otel/pipeline/emf_logs/translator.go index 221b52fc9b..c7c0bf9b6a 100644 --- a/translator/translate/otel/pipeline/emf_logs/translator.go +++ b/translator/translate/otel/pipeline/emf_logs/translator.go @@ -54,13 +54,13 @@ func (t *translator) Translate(conf *confmap.Conf) (*common.ComponentTranslators ), } if serviceAddress, ok := common.GetString(conf, serviceAddressEMFKey); ok { - if strings.Contains(serviceAddress, common.Udp) { + if strings.Contains(serviceAddress, common.UDP) { translators.Receivers.Set(udplog.NewTranslatorWithName(common.PipelineNameEmfLogs)) } else { translators.Receivers.Set(tcplog.NewTranslatorWithName(common.PipelineNameEmfLogs)) } } else if serviceAddress, ok = common.GetString(conf, serviceAddressStructuredLogKey); ok { - if strings.Contains(serviceAddress, common.Udp) { + if 
strings.Contains(serviceAddress, common.UDP) { translators.Receivers.Set(udplog.NewTranslatorWithName(common.PipelineNameEmfLogs)) } else { translators.Receivers.Set(tcplog.NewTranslatorWithName(common.PipelineNameEmfLogs)) diff --git a/translator/translate/otel/processor/batchprocessor/translator.go b/translator/translate/otel/processor/batchprocessor/translator.go index af9bef3aa6..2289bf56c0 100644 --- a/translator/translate/otel/processor/batchprocessor/translator.go +++ b/translator/translate/otel/processor/batchprocessor/translator.go @@ -23,13 +23,18 @@ var defaultForceFlushInterval = map[string]time.Duration{ type translator struct { name string telemetrySectionKey string + timeoutOverride *time.Duration factory processor.Factory } var _ common.ComponentTranslator = (*translator)(nil) func NewTranslatorWithNameAndSection(name string, telemetrySectionKey string) common.ComponentTranslator { - return &translator{name, telemetrySectionKey, batchprocessor.NewFactory()} + return &translator{name, telemetrySectionKey, nil, batchprocessor.NewFactory()} +} + +func NewTranslatorWithNameSectionAndTimeout(name string, telemetrySectionKey string, timeout time.Duration) common.ComponentTranslator { + return &translator{name, telemetrySectionKey, &timeout, batchprocessor.NewFactory()} } func (t *translator) ID() component.ID { @@ -38,7 +43,11 @@ func (t *translator) ID() component.ID { func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { cfg := t.factory.CreateDefaultConfig().(*batchprocessor.Config) - if duration, ok := common.GetDuration(conf, common.ConfigKey(t.telemetrySectionKey, common.ForceFlushIntervalKey)); ok { + + // First check if we have a timeout override + if t.timeoutOverride != nil { + cfg.Timeout = *t.timeoutOverride + } else if duration, ok := common.GetDuration(conf, common.ConfigKey(t.telemetrySectionKey, common.ForceFlushIntervalKey)); ok { cfg.Timeout = duration } else if defaultDuration, ok := 
defaultForceFlushInterval[t.telemetrySectionKey]; ok { cfg.Timeout = defaultDuration diff --git a/translator/translate/otel/processor/batchprocessor/translator_test.go b/translator/translate/otel/processor/batchprocessor/translator_test.go index 4de64a35b6..ceb306377f 100644 --- a/translator/translate/otel/processor/batchprocessor/translator_test.go +++ b/translator/translate/otel/processor/batchprocessor/translator_test.go @@ -77,6 +77,41 @@ func TestTranslator(t *testing.T) { SendBatchMaxSize: 0, }, }, + "TimeoutOverrideMetricsSection": { + translator: NewTranslatorWithNameSectionAndTimeout("test", common.MetricsKey, 45*time.Second), + input: map[string]interface{}{ + "metrics": map[string]interface{}{ + "force_flush_interval": 30, + }, + }, + want: &batchprocessor.Config{ + Timeout: 45 * time.Second, + SendBatchSize: 8192, + SendBatchMaxSize: 0, + }, + }, + "TimeoutOverrideLogsSection": { + translator: NewTranslatorWithNameSectionAndTimeout("test", common.LogsKey, 15*time.Second), + input: map[string]interface{}{ + "logs": map[string]interface{}{}, + }, + want: &batchprocessor.Config{ + Timeout: 15 * time.Second, + SendBatchSize: 8192, + SendBatchMaxSize: 0, + }, + }, + "TimeoutOverrideNotConfiguredSection": { + translator: NewTranslatorWithNameSectionAndTimeout("test", common.TracesKey, 25*time.Second), + input: map[string]interface{}{ + "traces": map[string]interface{}{}, + }, + want: &batchprocessor.Config{ + Timeout: 25 * time.Second, + SendBatchSize: 8192, + SendBatchMaxSize: 0, + }, + }, } for name, tc := range testCases { t.Run(name, func(t *testing.T) { diff --git a/translator/translate/otel/processor/groupbyattrsprocessor/translator.go b/translator/translate/otel/processor/groupbyattrsprocessor/translator.go new file mode 100644 index 0000000000..54a986491d --- /dev/null +++ b/translator/translate/otel/processor/groupbyattrsprocessor/translator.go @@ -0,0 +1,33 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +package groupbyattrsprocessor + +import ( + "github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor" + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/confmap" + "go.opentelemetry.io/collector/processor" + + "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" +) + +type translator struct { + name string + factory processor.Factory +} + +var _ common.ComponentTranslator = (*translator)(nil) + +func NewTranslatorWithName(name string) common.ComponentTranslator { + return &translator{name, groupbyattrsprocessor.NewFactory()} +} + +func (t *translator) ID() component.ID { + return component.NewIDWithName(t.factory.Type(), t.name) +} + +func (t *translator) Translate(_ *confmap.Conf) (component.Config, error) { + cfg := t.factory.CreateDefaultConfig().(*groupbyattrsprocessor.Config) + return cfg, nil +} diff --git a/translator/translate/otel/receiver/awscontainerinsight/translator.go b/translator/translate/otel/receiver/awscontainerinsight/translator.go index ba3566c801..f83dc32fe4 100644 --- a/translator/translate/otel/receiver/awscontainerinsight/translator.go +++ b/translator/translate/otel/receiver/awscontainerinsight/translator.go @@ -8,7 +8,6 @@ import ( "fmt" "os" "strings" - "time" "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver" "go.opentelemetry.io/collector/component" @@ -32,8 +31,7 @@ const ( ecs = "ecs" eks = "eks" - defaultMetricsCollectionInterval = time.Minute - defaultLeaderLockName = "cwagent-clusterleader" // To maintain backwards compatability with https://github.com/aws/amazon-cloudwatch-agent/blob/2dd89abaab4590cffbbc31ef89319b62809b09d1/plugins/inputs/k8sapiserver/k8sapiserver.go#L30 + defaultLeaderLockName = "cwagent-clusterleader" // To maintain backwards compatibility with 
https://github.com/aws/amazon-cloudwatch-agent/blob/2dd89abaab4590cffbbc31ef89319b62809b09d1/plugins/inputs/k8sapiserver/k8sapiserver.go#L30 ) type translator struct { @@ -82,10 +80,12 @@ func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { common.ConfigKey(configuredService.Key, common.MetricsCollectionIntervalKey), common.ConfigKey(common.AgentKey, common.MetricsCollectionIntervalKey), } - cfg.CollectionInterval = common.GetOrDefaultDuration(conf, intervalKeyChain, defaultMetricsCollectionInterval) + cfg.CollectionInterval = common.GetOrDefaultDuration(conf, intervalKeyChain, DefaultMetricsCollectionInterval) cfg.CollectionRole = getCollectionRole() cfg.ContainerOrchestrator = configuredService.Value cfg.AWSSessionSettings.Region = agent.Global_Config.Region + cfg.AcceleratedComputeGPUMetricsCollectionInterval = GetAcceleratedComputeGPUMetricsCollectionInterval(conf) + if profileKey, ok := agent.Global_Config.Credentials[agent.Profile_Key]; ok { cfg.AWSSessionSettings.Profile = fmt.Sprintf("%v", profileKey) } diff --git a/translator/translate/otel/receiver/awscontainerinsight/translator_test.go b/translator/translate/otel/receiver/awscontainerinsight/translator_test.go index 09570dcf46..76b578d2f7 100644 --- a/translator/translate/otel/receiver/awscontainerinsight/translator_test.go +++ b/translator/translate/otel/receiver/awscontainerinsight/translator_test.go @@ -46,6 +46,7 @@ func TestTranslator(t *testing.T) { CollectionInterval: time.Minute, LeaderLockName: "otel-container-insight-clusterleader", TagService: true, + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithECS/WithAgentInterval": { @@ -64,6 +65,7 @@ func TestTranslator(t *testing.T) { CollectionInterval: 20 * time.Second, LeaderLockName: "otel-container-insight-clusterleader", TagService: true, + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithECS/WithSectionInterval": { @@ -84,6 +86,7 
@@ func TestTranslator(t *testing.T) { CollectionInterval: 10 * time.Second, LeaderLockName: "otel-container-insight-clusterleader", TagService: true, + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithKubernetes": { @@ -105,6 +108,7 @@ func TestTranslator(t *testing.T) { LeaderLockUsingConfigMapOnly: true, TagService: true, KubeConfigPath: "", + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithKubernetes/WithoutClusterName": { @@ -136,6 +140,7 @@ func TestTranslator(t *testing.T) { LeaderLockUsingConfigMapOnly: true, ClusterName: "TestCluster", KubeConfigPath: "", + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithKubernetes/WithCollectionRoleLeader": { @@ -160,6 +165,7 @@ func TestTranslator(t *testing.T) { LeaderLockUsingConfigMapOnly: true, ClusterName: "TestCluster", KubeConfigPath: "", + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithKubernetes/WithCollectionRoleNode": { @@ -184,6 +190,7 @@ func TestTranslator(t *testing.T) { LeaderLockUsingConfigMapOnly: true, ClusterName: "TestCluster", KubeConfigPath: "", + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithKubernetes/WithEnhancedContainerInsights": { @@ -209,6 +216,7 @@ func TestTranslator(t *testing.T) { AddFullPodNameMetricLabel: true, AddContainerNameMetricLabel: true, KubeConfigPath: "", + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithKubernetes/WithLevel1Granularity": { @@ -233,6 +241,7 @@ func TestTranslator(t *testing.T) { AddFullPodNameMetricLabel: false, AddContainerNameMetricLabel: false, KubeConfigPath: "", + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithKubernetes/WithLevel2Granularity": { @@ -258,6 +267,7 @@ func TestTranslator(t *testing.T) { AddFullPodNameMetricLabel: true, 
AddContainerNameMetricLabel: true, KubeConfigPath: "", + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithKubernetes/WithLevel3Granularity": { @@ -283,6 +293,7 @@ func TestTranslator(t *testing.T) { AddFullPodNameMetricLabel: true, AddContainerNameMetricLabel: true, KubeConfigPath: "", + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithECSAndKubernetes": { @@ -304,6 +315,7 @@ func TestTranslator(t *testing.T) { LeaderLockName: "otel-container-insight-clusterleader", LeaderLockUsingConfigMapOnly: false, TagService: true, + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithEKSAndCustomKubeConfigPathHostDetails": { @@ -332,6 +344,36 @@ func TestTranslator(t *testing.T) { HostName: "test-hostname", HostIP: "1.2.3.4", RunOnSystemd: true, + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value + }, + }, + "WithKubernetes/WithEnhancedContainerInsights/WithHighFrequencyGPUMetrics": { + input: map[string]interface{}{ + "logs": map[string]interface{}{ + "metrics_collected": map[string]interface{}{ + "kubernetes": map[string]interface{}{ + "enhanced_container_insights": true, + "accelerated_compute_metrics": true, + "accelerated_compute_gpu_metrics_collection_interval": 30, // 30 seconds, less than default 60s + "cluster_name": "TestCluster", + }, + }, + }, + }, + want: &awscontainerinsightreceiver.Config{ + ContainerOrchestrator: eks, + CollectionInterval: 60 * time.Second, + PrefFullPodName: true, + LeaderLockName: defaultLeaderLockName, + LeaderLockUsingConfigMapOnly: true, + ClusterName: "TestCluster", + TagService: true, + EnableControlPlaneMetrics: true, + AddFullPodNameMetricLabel: true, + AddContainerNameMetricLabel: true, + KubeConfigPath: "", + EnableAcceleratedComputeMetrics: true, + AcceleratedComputeGPUMetricsCollectionInterval: 30 * time.Second, // Custom value }, }, } @@ -364,6 +406,7 @@ func 
TestTranslator(t *testing.T) { require.Equal(t, testCase.want.HostName, gotCfg.HostName) require.Equal(t, testCase.want.HostIP, gotCfg.HostIP) require.Equal(t, testCase.want.RunOnSystemd, gotCfg.RunOnSystemd) + require.Equal(t, testCase.want.AcceleratedComputeGPUMetricsCollectionInterval, gotCfg.AcceleratedComputeGPUMetricsCollectionInterval) } }) } diff --git a/translator/translate/otel/receiver/awscontainerinsight/utils.go b/translator/translate/otel/receiver/awscontainerinsight/utils.go index 721951b056..37daf25e66 100644 --- a/translator/translate/otel/receiver/awscontainerinsight/utils.go +++ b/translator/translate/otel/receiver/awscontainerinsight/utils.go @@ -4,13 +4,16 @@ package awscontainerinsight import ( + "time" + "go.opentelemetry.io/collector/confmap" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" ) const ( - BaseContainerInsights = iota + 1 + BaseContainerInsights = iota + 1 + DefaultMetricsCollectionInterval = time.Minute ) func EnhancedContainerInsightsEnabled(conf *confmap.Conf) bool { @@ -27,3 +30,15 @@ func EnhancedContainerInsightsEnabled(conf *confmap.Conf) bool { func AcceleratedComputeMetricsEnabled(conf *confmap.Conf) bool { return common.GetOrDefaultBool(conf, common.ConfigKey(common.LogsKey, common.MetricsCollectedKey, common.KubernetesKey, common.EnableAcceleratedComputeMetric), true) } + +func GetAcceleratedComputeGPUMetricsCollectionInterval(conf *confmap.Conf) time.Duration { + return common.GetOrDefaultDuration(conf, []string{ + common.ConfigKey(common.LogsKey, common.MetricsCollectedKey, common.KubernetesKey, common.AcceleratedComputeGPUMetricsCollectionInterval), + }, DefaultMetricsCollectionInterval) +} + +func IsHighFrequencyGPUMetricsEnabled(conf *confmap.Conf) bool { + return EnhancedContainerInsightsEnabled(conf) && + AcceleratedComputeMetricsEnabled(conf) && + GetAcceleratedComputeGPUMetricsCollectionInterval(conf) < DefaultMetricsCollectionInterval +} diff --git 
a/translator/translate/otel/receiver/tcplog/translator.go b/translator/translate/otel/receiver/tcplog/translator.go index 3db0906c74..2863802d4f 100644 --- a/translator/translate/otel/receiver/tcplog/translator.go +++ b/translator/translate/otel/receiver/tcplog/translator.go @@ -61,7 +61,7 @@ func (t *translator) ID() component.ID { // tcp:localhost:25888 func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { if !conf.IsSet(baseKey) || - (conf.IsSet(common.ConfigKey(serviceAddressKey)) && !strings.Contains(fmt.Sprintf("%v", conf.Get(serviceAddressKey)), common.Tcp)) { + (conf.IsSet(common.ConfigKey(serviceAddressKey)) && !strings.Contains(fmt.Sprintf("%v", conf.Get(serviceAddressKey)), common.TCP)) { return nil, &common.MissingKeyError{ID: t.ID(), JsonKey: fmt.Sprintf("missing %s or tcp service address", baseKey)} } cfg := t.factory.CreateDefaultConfig().(*tcplogreceiver.TCPLogConfig) diff --git a/translator/translate/otel/receiver/udplog/translator.go b/translator/translate/otel/receiver/udplog/translator.go index c315bf9818..7b163c0e99 100644 --- a/translator/translate/otel/receiver/udplog/translator.go +++ b/translator/translate/otel/receiver/udplog/translator.go @@ -61,7 +61,7 @@ func (t *translator) ID() component.ID { // udp:localhost:25888 func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { if !conf.IsSet(baseKey) || - (conf.IsSet(common.ConfigKey(serviceAddressKey)) && !strings.Contains(fmt.Sprintf("%v", conf.Get(serviceAddressKey)), common.Udp)) { + (conf.IsSet(common.ConfigKey(serviceAddressKey)) && !strings.Contains(fmt.Sprintf("%v", conf.Get(serviceAddressKey)), common.UDP)) { return nil, &common.MissingKeyError{ID: t.ID(), JsonKey: fmt.Sprintf("missing %s or udp service address", baseKey)} } cfg := t.factory.CreateDefaultConfig().(*udplogreceiver.UDPLogConfig)