Skip to content

Conversation

@yanhaoluo666
Copy link
Collaborator

@yanhaoluo666 yanhaoluo666 commented Oct 13, 2025

Note

This PR is part of enabling high frequency gpu metrics, other related PRs are 371 and 1893.

Description

This PR introduces a new config field - accelerated_compute_gpu_metrics_collection_interval - to let customers configure the GPU metrics collection interval based on their own use case. For example, setting this value to 1 directs the CloudWatch agent to scrape GPU metrics every second.

Testing

  1. Set this field in the helm-chart configuration as follows:
agents:
  - name: cloudwatch-agent
agent:
  name:
  mode: daemonset # Represents the mode the AmazonCloudWatchAgent workload will run in (deployment, daemonset or statefulset)
  replicas: 1
  defaultConfig:
    {
      "logs": {
        "metrics_collected": {
          "kubernetes": {
            "enhanced_container_insights": true,
            "accelerated_compute_gpu_metrics_collection_interval": 1

          }
        }
      },
      "traces": {
        "traces_collected": {
          "application_signals": { }
        }
      }
    }
  2. Deployed the agent code and helm chart and verified there were 60 datapoints each minute.
{
  "CloudWatchMetrics": [
      {
          "Namespace": "ContainerInsights",
          "Dimensions": [
              [
                  "ClusterName"
              ],
              [
                  "ClusterName",
                  "ContainerName",
                  "Namespace",
                  "PodName"
              ],
              [
                  "ClusterName",
                  "ContainerName",
                  "FullPodName",
                  "Namespace",
                  "PodName"
              ],
              [
                  "ClusterName",
                  "ContainerName",
                  "FullPodName",
                  "GpuDevice",
                  "Namespace",
                  "PodName"
              ]
          ],
          "Metrics": [
              {
                  "Name": "container_gpu_temperature",
                  "Unit": "None",
                  "StorageResolution": 60
              },
              {
                  "Name": "container_gpu_power_draw",
                  "Unit": "None",
                  "StorageResolution": 60
              },
              {
                  "Name": "container_gpu_utilization",
                  "Unit": "Percent",
                  "StorageResolution": 60
              },
              {
                  "Name": "container_gpu_memory_utilization",
                  "Unit": "Percent",
                  "StorageResolution": 60
              },
              {
                  "Name": "container_gpu_memory_used",
                  "Unit": "Bytes",
                  "StorageResolution": 60
              },
              {
                  "Name": "container_gpu_memory_total",
                  "Unit": "Bytes",
                  "StorageResolution": 60
              }
          ]
      }
  ],
  "ClusterName": "cpipeline",
  "ContainerName": "main",
  "FullPodName": "gpu-burn-577f5d7468-4j54s",
  "GpuDevice": "nvidia0",
  "InstanceId": "i-0f01fff8faa360227",
  "InstanceType": "g4dn.xlarge",
  "Namespace": "kube-system",
  "NodeName": "ip-192-168-6-219.ec2.internal",
  "PodName": "gpu-burn",
  "Sources": [
      "dcgm",
      "pod",
      "calculated"
  ],
  "Timestamp": "1760375344178",
  "Type": "ContainerGPU",
  "UUID": "GPU-60efa417-4d26-c4ba-9e62-66249559952d",
  "Version": "0",
  "kubernetes": {
      "container_name": "main",
      "containerd": {
          "container_id": "5bfc51b6805d8bdc96e34f262394ae2702cc5d55ad186c660acbef414aa86223"
      },
      "host": "ip-192-168-6-219.ec2.internal",
      "labels": {
          "app": "gpu-burn",
          "pod-template-hash": "577f5d7468"
      },
      "pod_name": "gpu-burn-577f5d7468-4j54s",
      "pod_owners": [
          {
              "owner_kind": "Deployment",
              "owner_name": "gpu-burn"
          }
      ]
  },
  "container_gpu_memory_total": {
      "Values": [
          16006027360
      ],
      "Counts": [
          60
      ],
      "Max": 16006027360,
      "Min": 16006027360,
      "Count": 60,
      "Sum": 982473768960
  },
  "container_gpu_memory_used": {
      "Values": [
          0,
          176060768,
          245366784,
          14254342144,
          253755392,
          111149056,
          207608048,
          251658240
      ],
      "Counts": [
          8,
          1,
          1,
          46,
          1,
          1,
          1,
          1
      ],
      "Max": 14254342144,
      "Min": 0,
      "Count": 60,
      "Sum": 656945446912
  },
  "container_gpu_memory_utilization": {
      "Values": [
          1.185,
          0.9862,
          90.0607,
          1.609,
          0.6948,
          1.3572000000000002,
          1.5559999999999998,
          0
      ],
      "Counts": [
          1,
          1,
          46,
          1,
          1,
          1,
          1,
          8
      ],
      "Max": 90.0607,
      "Min": 0,
      "Count": 60,
      "Sum": 4150.226400000004
  },
  "container_gpu_power_draw": {
      "Values": [
          32.662,
          70.563,
          69.099,
          32.760,
          69.49,
          33.549,
          69.978,
          69.197,
          33.844,
          63.907,
          65.919,
          70.368,
          70.27,
          38.921,
          69.435,
          68.360,
          69.88,
          70.173,
          68.318,
          70.119,
          67.872,
          70.466,
          65.626,
          67.97,
          69.826,
          32.859,
          33.352,
          70.660,
          70.075,
          33.253,
          69.294,
          69.587,
          68.904,
          38.429,
          82.459,
          69.685,
          69.392,
          68.849,
          69.782,
          68.458
      ],
      "Counts": [
          2,
          2,
          1,
          1,
          1,
          1,
          4,
          1,
          1,
          1,
          1,
          1,
          3,
          1,
          1,
          1,
          3,
          1,
          1,
          1,
          1,
          1,
          1,
          1,
          1,
          4,
          1,
          3,
          2,
          2,
          1,
          1,
          1,
          1,
          1,
          4,
          1,
          1,
          2,
          1
      ],
      "Max": 82.459,
      "Min": 32.662,
      "Count": 60,
      "Sum": 3748.8209999999995
  },
  "container_gpu_temperature": {
      "Values": [
          42,
          43,
          44
      ],
      "Counts": [
          12,
          32,
          16
      ],
      "Max": 44,
      "Min": 42,
      "Count": 60,
      "Sum": 2628
  },
  "container_gpu_utilization": {
      "Values": [
          96,
          6,
          8,
          14,
          58,
          0,
          64,
          9,
          89,
          7,
          100
      ],
      "Counts": [
          1,
          1,
          1,
          1,
          1,
          6,
          1,
          1,
          1,
          2,
          44
      ],
      "Max": 100,
      "Min": 0,
      "Count": 60,
      "Sum": 4858
  }
}

…lection_interval config to support gpu metrics collection interval customization
@yanhaoluo666 yanhaoluo666 force-pushed the feature/gpu-metrics-high-sampling branch from 385c069 to f8ddd89 Compare October 21, 2025 12:00
@yanhaoluo666 yanhaoluo666 merged commit db57f66 into amazon-contributing:aws-cwa-dev Oct 21, 2025
272 of 274 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

5 participants