|
| 1 | +/* |
| 2 | + * |
| 3 | + * Copyright (C) 2025 Robert Dyro <robert.dyro@gmail.com> |
| 4 | + * |
| 5 | + * This file is part of Nvtop |
| 6 | + * |
| 7 | + * Nvtop is free software: you can redistribute it and/or modify |
| 8 | + * it under the terms of the GNU General Public License as published by |
| 9 | + * the Free Software Foundation, either version 3 of the License, or |
| 10 | + * (at your option) any later version. |
| 11 | + * |
| 12 | + * Nvtop is distributed in the hope that it will be useful, |
| 13 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 15 | + * GNU General Public License for more details. |
| 16 | + * |
| 17 | + * You should have received a copy of the GNU General Public License |
| 18 | + * along with nvtop. If not, see <http://www.gnu.org/licenses/>. |
| 19 | + * |
| 20 | + */ |
| 21 | + |
| 22 | +#include "nvtop/extract_gpuinfo_common.h" |
| 23 | +#include "nvtop/time.h" |
| 24 | + |
| 25 | +#include <fcntl.h> |
| 26 | +#include <stdlib.h> |
| 27 | +#include <stdio.h> |
| 28 | +#include <string.h> |
| 29 | +#include <unistd.h> |
| 30 | +#include <math.h> |
| 31 | +#include <dlfcn.h> |
| 32 | +#include <sys/time.h> |
| 33 | + |
| 34 | +struct gpu_info_tpu { |
| 35 | + struct gpu_info base; |
| 36 | + int device_id; |
| 37 | +}; |
| 38 | + |
/* Cached usage snapshot for one TPU chip, filled in by refresh_tpu_cache(). */
struct tpu_chip_usage_data {
  /* Short label; a full cache reset stores "N/A" here. */
  char name[8];
  /* Device id as reported by the tpu_metrics call. */
  int64_t device_id;
  /* Memory currently in use (NOTE(review): presumably bytes - confirm against libtpuinfo). */
  int64_t memory_usage;
  /* Total memory of the chip, in the same unit as memory_usage. */
  int64_t total_memory;
  /* Duty cycle, in percent. */
  double duty_cycle_pct;
  /* Process id reported by the tpu_pids call; negative values mean "no process". */
  int64_t pid;
};
| 47 | + |
/* Vendor callbacks wired into gpu_vendor_tpu. */
static bool gpuinfo_tpu_init(void);
static void gpuinfo_tpu_shutdown(void);
static const char *gpuinfo_tpu_last_error_string(void);
static bool gpuinfo_tpu_get_device_handles(struct list_head *devices, unsigned *count);
static void gpuinfo_tpu_populate_static_info(struct gpu_info *_gpu_info);
static void gpuinfo_tpu_refresh_dynamic_info(struct gpu_info *_gpu_info);
static void gpuinfo_tpu_get_running_processes(struct gpu_info *_gpu_info);

/* Internal helpers: usage-data cache management and pointer cleanup. */
static bool is_cache_valid(void);
static bool refresh_tpu_cache(void);
static void reset_tpu_cache(bool fully);
static void free_ptr(void **ptr);
| 59 | + |
| 60 | +struct gpu_vendor gpu_vendor_tpu = { |
| 61 | + .init = gpuinfo_tpu_init, |
| 62 | + .shutdown = gpuinfo_tpu_shutdown, |
| 63 | + .last_error_string = gpuinfo_tpu_last_error_string, |
| 64 | + .get_device_handles = gpuinfo_tpu_get_device_handles, |
| 65 | + .populate_static_info = gpuinfo_tpu_populate_static_info, |
| 66 | + .refresh_dynamic_info = gpuinfo_tpu_refresh_dynamic_info, |
| 67 | + .refresh_running_processes = gpuinfo_tpu_get_running_processes, |
| 68 | + .name = "TPU", |
| 69 | +}; |
| 70 | + |
| 71 | +__attribute__((constructor)) static void init_extract_gpuinfo_tpu(void) { |
| 72 | + register_gpu_vendor(&gpu_vendor_tpu); |
| 73 | +} |
| 74 | + |
/* Number of TPU chips as returned by libtpuinfo; -1 while uninitialized and again after shutdown. */
int64_t tpu_chip_count = -1;

/* Array of per-chip device records, allocated in gpuinfo_tpu_get_device_handles(). */
static struct gpu_info_tpu *gpu_infos;
| 77 | + |
/* Two-level stringification so a macro argument is expanded before it is turned into a string. */
#define STRINGIFY(x) STRINGIFY_HELPER_(x)
#define STRINGIFY_HELPER_(x) #x

/* Vendor identifier for TPU devices (NOTE(review): presumably the PCI vendor id - confirm). */
#define VENDOR_TPU 0x1ae0
#define VENDOR_TPU_STR STRINGIFY(VENDOR_TPU)

/*
 * Every argument use is parenthesized so that arguments containing
 * lower-precedence operators (e.g. `a ? b : c`) compare as a unit.
 * Arguments are still evaluated twice: do not pass expressions with side effects.
 */
#define MAX(x, y) (((x) >= (y)) ? (x) : (y))
#define MIN(x, y) (((x) <= (y)) ? (x) : (y))

/* Integer type used in the libtpuinfo function signatures declared in this file. */
#define int64 long long
| 88 | + |
| 89 | +int (*_tpu_chip_count)(void); |
| 90 | +int (*_tpu_metrics)(int port, int64 *device_ids, int64 *memory_usage, |
| 91 | + int64 *total_memory, double *duty_cycle_pct, int n); |
| 92 | +int (*_tpu_pids)(int64 *pids, int n); |
| 93 | + |
| 94 | +char *libname = "libtpuinfo.so"; |
| 95 | +// -1 means allowing libtpuinfo to select the default port |
| 96 | +// env LIBTPUINFO_GRPC_PORT={int} allows setting the port via an environment variable |
| 97 | +// $ env LIBTPUINFO_GRPC_PORT=8431 nvtop |
| 98 | +int tpu_runtime_monitoring_port = -1; |
| 99 | + |
| 100 | +/* TPU info cache ------------------------------------------------------------------------------- */ |
| 101 | +struct tpu_chip_usage_data *latest_chips_usage_data = NULL; |
| 102 | +nvtop_time last_cache_refresh; |
| 103 | +int64 *_pids, *_device_ids, *_memory_usage, *_total_memory; |
| 104 | +double* _duty_cycle_pct; |
| 105 | + |
| 106 | +bool is_cache_valid(void) { |
| 107 | + nvtop_time current_time; |
| 108 | + nvtop_get_current_time(¤t_time); |
| 109 | + uint64_t t_diff_ns = nvtop_difftime_u64(last_cache_refresh, current_time); |
| 110 | + return t_diff_ns < 900 * 1000 * 1000; // 900ms |
| 111 | +} |
| 112 | + |
| 113 | +bool refresh_tpu_cache(void) { |
| 114 | + if (is_cache_valid()) return true; |
| 115 | + nvtop_get_current_time(&last_cache_refresh); |
| 116 | + if (tpu_chip_count <= 0) return false; |
| 117 | + if (_tpu_pids(_pids, tpu_chip_count) != 0) { |
| 118 | + reset_tpu_cache(false); |
| 119 | + return false; |
| 120 | + } |
| 121 | + for (int64_t i = 0; i < tpu_chip_count; i++) latest_chips_usage_data[i].pid = _pids[i]; |
| 122 | + |
| 123 | + if (_tpu_metrics(tpu_runtime_monitoring_port, _device_ids, _memory_usage, _total_memory, |
| 124 | + _duty_cycle_pct, tpu_chip_count) != 0) return false; |
| 125 | + for (int64_t i = 0; i < tpu_chip_count; i++) { |
| 126 | + latest_chips_usage_data[i].device_id = _device_ids[i]; |
| 127 | + latest_chips_usage_data[i].memory_usage = _memory_usage[i]; |
| 128 | + latest_chips_usage_data[i].total_memory = _total_memory[i]; |
| 129 | + latest_chips_usage_data[i].duty_cycle_pct = _duty_cycle_pct[i]; |
| 130 | + } |
| 131 | + return true; |
| 132 | +} |
| 133 | + |
| 134 | +void reset_tpu_cache(bool fully) { |
| 135 | + for (int64_t i = 0; i < tpu_chip_count; i++) { |
| 136 | + latest_chips_usage_data[i].memory_usage = 0; |
| 137 | + latest_chips_usage_data[i].duty_cycle_pct = 0; |
| 138 | + latest_chips_usage_data[i].pid = -1; |
| 139 | + if (fully) { |
| 140 | + snprintf(latest_chips_usage_data[i].name, sizeof(latest_chips_usage_data[i].name), "%s", "N/A"); |
| 141 | + latest_chips_usage_data[i].device_id = 0; |
| 142 | + latest_chips_usage_data[i].total_memory = 0; |
| 143 | + } |
| 144 | + } |
| 145 | +} |
| 146 | +/* TPU info cache ------------------------------------------------------------------------------- */ |
| 147 | + |
| 148 | +bool gpuinfo_tpu_init(void) { |
| 149 | + char* error_msg; |
| 150 | + nvtop_get_current_time(&last_cache_refresh); |
| 151 | + // invalidate cache by putting it in the past |
| 152 | + last_cache_refresh = nvtop_substract_time(last_cache_refresh, (nvtop_time){10, 0}); |
| 153 | + |
| 154 | + // Load dynamic library symbols |
| 155 | + void *handle = dlopen(libname, RTLD_LAZY); |
| 156 | + if (!handle) { |
| 157 | + error_msg = dlerror(); |
| 158 | +#ifndef NDEBUG |
| 159 | + if (error_msg != NULL) fprintf(stderr, "TPU support error: %s\n", error_msg); |
| 160 | +#endif |
| 161 | + return false; |
| 162 | + } |
| 163 | + |
| 164 | + // Resolve the necessary symbols within the library |
| 165 | + _tpu_chip_count = dlsym(handle, "tpu_chip_count"); |
| 166 | + error_msg = dlerror(); |
| 167 | + if (error_msg != NULL) { |
| 168 | +#ifndef NDEBUG |
| 169 | + fprintf(stderr, "libtpuinfo can't resolve symbol `tpu_chip_count` with error: %s\n", error_msg); |
| 170 | +#endif |
| 171 | + return false; |
| 172 | + } |
| 173 | + _tpu_pids = dlsym(handle, "tpu_pids"); |
| 174 | + error_msg = dlerror(); |
| 175 | + if (error_msg != NULL) { |
| 176 | +#ifndef NDEBUG |
| 177 | + fprintf(stderr, "libtpuinfo can't resolve symbol `tpu_pids` with error: %s\n", error_msg); |
| 178 | +#endif |
| 179 | + return false; |
| 180 | + } |
| 181 | + _tpu_metrics = dlsym(handle, "tpu_metrics"); |
| 182 | + error_msg = dlerror(); |
| 183 | + if (error_msg != NULL) { |
| 184 | +#ifndef NDEBUG |
| 185 | + fprintf(stderr, "libtpuinfo can't resolve symbol `tpu_metrics` with error: %s\n", error_msg); |
| 186 | +#endif |
| 187 | + return false; |
| 188 | + } |
| 189 | + |
| 190 | + // Discover TPU devices |
| 191 | + tpu_chip_count = _tpu_chip_count(); |
| 192 | + if (tpu_chip_count == 0) { |
| 193 | +#ifndef NDEBUG |
| 194 | + fprintf(stderr, "Found 0 TPU devices on the system.\n"); |
| 195 | +#endif |
| 196 | + return false; |
| 197 | + } |
| 198 | + |
| 199 | + // Allocate memory for TPU device data cache |
| 200 | + latest_chips_usage_data = (struct tpu_chip_usage_data*)malloc(tpu_chip_count*sizeof(struct tpu_chip_usage_data)); |
| 201 | + _pids = (int64*)malloc(sizeof(int64) * tpu_chip_count); |
| 202 | + _device_ids = (int64*)malloc(sizeof(int64) * tpu_chip_count); |
| 203 | + _memory_usage = (int64*)malloc(sizeof(int64) * tpu_chip_count); |
| 204 | + _total_memory = (int64*)malloc(sizeof(int64) * tpu_chip_count); |
| 205 | + _duty_cycle_pct = (double*)malloc(sizeof(double) * tpu_chip_count); |
| 206 | + reset_tpu_cache(true); |
| 207 | + return true; |
| 208 | +} |
| 209 | + |
/*
 * Frees the allocation that `*ptr` refers to and sets `*ptr` to NULL.
 * Does nothing when `ptr` itself or `*ptr` is NULL.
 */
void free_ptr(void **ptr) {
  if (ptr == NULL || *ptr == NULL)
    return;

  free(*ptr);
  *ptr = NULL;
}
| 216 | + |
| 217 | +void gpuinfo_tpu_shutdown(void) { |
| 218 | + free_ptr((void **)&gpu_infos); |
| 219 | + free_ptr((void **)&latest_chips_usage_data); |
| 220 | + free_ptr((void **)&_pids); |
| 221 | + free_ptr((void **)&_device_ids); |
| 222 | + free_ptr((void **)&_memory_usage); |
| 223 | + free_ptr((void **)&_total_memory); |
| 224 | + free_ptr((void **)&_duty_cycle_pct); |
| 225 | + tpu_chip_count = -1; |
| 226 | +} |
| 227 | + |
/* Returns the backend's error description; always the fixed string "Err". */
const char *gpuinfo_tpu_last_error_string(void) {
  return "Err";
}
| 229 | + |
| 230 | +static void add_tpu_chip(struct list_head *devices, unsigned *count) { |
| 231 | + struct gpu_info_tpu *this_tpu = &gpu_infos[*count]; |
| 232 | + this_tpu->base.vendor = &gpu_vendor_tpu; |
| 233 | + this_tpu->device_id = *count; |
| 234 | + snprintf(this_tpu->base.pdev, PDEV_LEN, "TPU%u", *count); |
| 235 | + list_add_tail(&this_tpu->base.list, devices); |
| 236 | + |
| 237 | + this_tpu->base.processes_count = 0; |
| 238 | + this_tpu->base.processes = NULL; |
| 239 | + this_tpu->base.processes_array_size = 0; |
| 240 | + |
| 241 | + *count = *count + 1; |
| 242 | +} |
| 243 | + |
| 244 | +bool gpuinfo_tpu_get_device_handles(struct list_head *devices_list, unsigned *count) { |
| 245 | + *count = 0; |
| 246 | + if (tpu_chip_count <= 0) return false; |
| 247 | + gpu_infos = (struct gpu_info_tpu *)calloc(tpu_chip_count, sizeof(*gpu_infos)); |
| 248 | + if (!gpu_infos) return false; |
| 249 | + for (int64_t i = 0; i < tpu_chip_count; i++) add_tpu_chip(devices_list, count); |
| 250 | + return true; |
| 251 | +} |
| 252 | + |
| 253 | +void gpuinfo_tpu_populate_static_info(struct gpu_info *_gpu_info) { |
| 254 | + struct gpu_info_tpu *gpu_info = container_of(_gpu_info, struct gpu_info_tpu, base); |
| 255 | + struct gpuinfo_static_info *static_info = &gpu_info->base.static_info; |
| 256 | + static_info->integrated_graphics = false; |
| 257 | + static_info->encode_decode_shared = false; |
| 258 | + RESET_ALL(static_info->valid); |
| 259 | + snprintf(static_info->device_name, MIN(sizeof(static_info->device_name), PDEV_LEN), "%s", gpu_info->base.pdev); |
| 260 | + SET_VALID(gpuinfo_device_name_valid, static_info->valid); |
| 261 | +} |
| 262 | + |
| 263 | +void gpuinfo_tpu_refresh_dynamic_info(struct gpu_info *_gpu_info) { |
| 264 | + struct gpu_info_tpu *gpu_info = container_of(_gpu_info, struct gpu_info_tpu, base); |
| 265 | + // struct gpuinfo_static_info *static_info = &gpu_info->base.static_info; // unused |
| 266 | + struct gpuinfo_dynamic_info *dynamic_info = &gpu_info->base.dynamic_info; |
| 267 | + |
| 268 | + refresh_tpu_cache(); |
| 269 | + |
| 270 | + if (gpu_info->device_id >= tpu_chip_count) return; |
| 271 | + struct tpu_chip_usage_data usage_data = latest_chips_usage_data[gpu_info->device_id]; |
| 272 | + double mem_util = round(1e2 * (double)(usage_data.memory_usage) / (double)MAX(1, usage_data.total_memory)); |
| 273 | + double tpu_util = round(usage_data.duty_cycle_pct); |
| 274 | + SET_GPUINFO_DYNAMIC(dynamic_info, gpu_util_rate, (int)tpu_util); |
| 275 | + SET_GPUINFO_DYNAMIC(dynamic_info, mem_util_rate, (int)mem_util); |
| 276 | + SET_GPUINFO_DYNAMIC(dynamic_info, total_memory, usage_data.total_memory); |
| 277 | + SET_GPUINFO_DYNAMIC(dynamic_info, used_memory, usage_data.memory_usage); |
| 278 | + SET_GPUINFO_DYNAMIC(dynamic_info, free_memory, usage_data.total_memory - usage_data.memory_usage); |
| 279 | + |
| 280 | + return; |
| 281 | +} |
| 282 | + |
| 283 | +void gpuinfo_tpu_get_running_processes(struct gpu_info *_gpu_info) { |
| 284 | + struct gpu_info_tpu *gpu_info = container_of(_gpu_info, struct gpu_info_tpu, base); |
| 285 | + if (gpu_info->device_id >= tpu_chip_count) return; |
| 286 | + if (tpu_chip_count <= 0 || latest_chips_usage_data[gpu_info->device_id].pid < 0) { |
| 287 | + _gpu_info->processes_count = 0; |
| 288 | + return; |
| 289 | + } |
| 290 | + _gpu_info->processes_count = 1; |
| 291 | + if (_gpu_info->processes_array_size == 0) { |
| 292 | + _gpu_info->processes_array_size = 1; |
| 293 | + _gpu_info->processes = (struct gpu_process*)malloc(1 * sizeof(struct gpu_process)); |
| 294 | + memset(_gpu_info->processes, 0, _gpu_info->processes_count * sizeof(*_gpu_info->processes)); |
| 295 | + } |
| 296 | + _gpu_info->processes[0].type = gpu_process_compute; |
| 297 | + _gpu_info->processes[0].pid = latest_chips_usage_data[gpu_info->device_id].pid; |
| 298 | + _gpu_info->processes[0].gpu_memory_usage = _gpu_info->dynamic_info.used_memory; |
| 299 | + |
| 300 | + SET_VALID(gpuinfo_process_gpu_memory_usage_valid, _gpu_info->processes[0].valid); |
| 301 | +} |
0 commit comments