Skip to content

Commit 13963be

Browse files
authored
Merge pull request #360 from rdyro/libtpuinfo
Adding TPU support via libtpuinfo
2 parents ca52d04 + c412a67 commit 13963be

File tree

3 files changed

+332
-0
lines changed

3 files changed

+332
-0
lines changed

CMakeLists.txt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,23 @@ else()
9191
set(ASCEND_SUPPORT_DEFAULT OFF)
9292
endif()
9393

# Google TPU monitoring goes through libtpuinfo. The library is only searched
# for on Linux; TPU support defaults to ON when libtpuinfo.so is found there
# and to OFF in every other case.
set(TPU_SUPPORT_DEFAULT OFF)
if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
  find_library(LIBTPUINFO
    NAMES libtpuinfo.so
    PATHS /usr/lib /usr/lib64 /usr/local/lib /usr/local/lib64
    HINTS ${CMAKE_INSTALL_PREFIX}/lib ${CMAKE_INSTALL_PREFIX}/lib64 lib lib64
  )
  if (LIBTPUINFO)
    set(TPU_SUPPORT_DEFAULT ON)
  endif()
endif()
110+
94111
option(NVIDIA_SUPPORT "Build support for NVIDIA GPUs through libnvml" ${NVIDIA_SUPPORT_DEFAULT})
95112
option(AMDGPU_SUPPORT "Build support for AMD GPUs through amdgpu driver" ${AMDGPU_SUPPORT_DEFAULT})
96113
option(INTEL_SUPPORT "Build support for Intel GPUs through i915 or xe driver" ${INTEL_SUPPORT_DEFAULT})
@@ -100,6 +117,7 @@ option(PANFROST_SUPPORT "Build support for Mali GPUs through panfrost driver" ${
100117
option(PANTHOR_SUPPORT "Build support for Mali GPUs through panthor driver" ${PANTHOR_SUPPORT_DEFAULT})
101118
option(ASCEND_SUPPORT "Build support for Ascend NPUs through Ascend DCMI" ${ASCEND_SUPPORT_DEFAULT})
102119
option(V3D_SUPPORT "Build support for Raspberrypi through v3d" ${V3D_SUPPORT_DEFAULT})
120+
# The default is TPU_SUPPORT_DEFAULT, which is ON only when libtpuinfo.so was located on a Linux host.
option(TPU_SUPPORT "Build support for Google TPUs through GRPC" ${TPU_SUPPORT_DEFAULT})
103121

104122
add_subdirectory(src)
105123

src/CMakeLists.txt

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,19 @@ if ((PANFROST_SUPPORT) OR (PANTHOR_SUPPORT))
132132
target_sources(nvtop PRIVATE extract_gpuinfo_mali_common.c)
133133
endif()
134134

# Google TPU backend.
# extract_gpuinfo_tpu.c loads libtpuinfo.so at run time with dlopen(), so the
# source is compiled in whenever TPU_SUPPORT is enabled. The lookup below only
# produces a configure-time diagnostic when the library cannot be located; it
# does not change what gets built.
if(TPU_SUPPORT)
  find_library(LIBTPUINFO
    NAMES libtpuinfo.so
    PATHS /usr/lib /usr/lib64 /usr/local/lib /usr/local/lib64
    HINTS ${CMAKE_INSTALL_PREFIX}/lib ${CMAKE_INSTALL_PREFIX}/lib64 lib lib64
  )
  if (NOT LIBTPUINFO)
    message(WARNING "TPU Support enabled, but libtpuinfo.so not found in ldconfig path, we will not be able to read TPU usage")
  endif()
  target_sources(nvtop PRIVATE extract_gpuinfo_tpu.c)
endif()
147+
135148
target_include_directories(nvtop PRIVATE
136149
${PROJECT_SOURCE_DIR}/include
137150
${PROJECT_BINARY_DIR}/include)

src/extract_gpuinfo_tpu.c

Lines changed: 301 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,301 @@
1+
/*
2+
*
3+
* Copyright (C) 2025 Robert Dyro <robert.dyro@gmail.com>
4+
*
5+
* This file is part of Nvtop
6+
*
7+
* Nvtop is free software: you can redistribute it and/or modify
8+
* it under the terms of the GNU General Public License as published by
9+
* the Free Software Foundation, either version 3 of the License, or
10+
* (at your option) any later version.
11+
*
12+
* Nvtop is distributed in the hope that it will be useful,
13+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
* GNU General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU General Public License
18+
* along with nvtop. If not, see <http://www.gnu.org/licenses/>.
19+
*
20+
*/
21+
22+
#include "nvtop/extract_gpuinfo_common.h"
23+
#include "nvtop/time.h"
24+
25+
#include <fcntl.h>
26+
#include <stdlib.h>
27+
#include <stdio.h>
28+
#include <string.h>
29+
#include <unistd.h>
30+
#include <math.h>
31+
#include <dlfcn.h>
32+
#include <sys/time.h>
33+
34+
struct gpu_info_tpu {
35+
struct gpu_info base;
36+
int device_id;
37+
};
38+
/* One cached reading for a single TPU chip, filled in by refresh_tpu_cache(). */
struct tpu_chip_usage_data {
  char name[8];          /* short label; reset_tpu_cache(true) writes "N/A" */
  int64_t device_id;     /* device id as reported by the metrics query */
  int64_t memory_usage;  /* used memory as reported by the metrics query */
  int64_t total_memory;  /* total memory as reported by the metrics query */
  double duty_cycle_pct; /* duty cycle, rounded and shown as the utilisation rate */
  int64_t pid;           /* process reported for the chip, -1 when there is none */
};
47+
/* Vendor callbacks wired into gpu_vendor_tpu. */
static bool gpuinfo_tpu_init(void);
static void gpuinfo_tpu_shutdown(void);
static const char *gpuinfo_tpu_last_error_string(void);
static bool gpuinfo_tpu_get_device_handles(struct list_head *devices, unsigned *count);
static void gpuinfo_tpu_populate_static_info(struct gpu_info *_gpu_info);
static void gpuinfo_tpu_refresh_dynamic_info(struct gpu_info *_gpu_info);
static void gpuinfo_tpu_get_running_processes(struct gpu_info *_gpu_info);

/* Internal helpers: usage cache management and pointer cleanup. */
static bool is_cache_valid(void);
static bool refresh_tpu_cache(void);
static void reset_tpu_cache(bool);
static void free_ptr(void **ptr);
59+
60+
struct gpu_vendor gpu_vendor_tpu = {
61+
.init = gpuinfo_tpu_init,
62+
.shutdown = gpuinfo_tpu_shutdown,
63+
.last_error_string = gpuinfo_tpu_last_error_string,
64+
.get_device_handles = gpuinfo_tpu_get_device_handles,
65+
.populate_static_info = gpuinfo_tpu_populate_static_info,
66+
.refresh_dynamic_info = gpuinfo_tpu_refresh_dynamic_info,
67+
.refresh_running_processes = gpuinfo_tpu_get_running_processes,
68+
.name = "TPU",
69+
};
70+
71+
__attribute__((constructor)) static void init_extract_gpuinfo_tpu(void) {
72+
register_gpu_vendor(&gpu_vendor_tpu);
73+
}
74+
/* Chip count returned by libtpuinfo; -1 before initialisation and after shutdown. */
int64_t tpu_chip_count = -1;
/* Array of device records, allocated in gpuinfo_tpu_get_device_handles(). */
static struct gpu_info_tpu *gpu_infos;

/* Two-level stringification so that macro arguments are expanded before being quoted. */
#define STRINGIFY_HELPER_(x) #x
#define STRINGIFY(x) STRINGIFY_HELPER_(x)

/* NOTE(review): presumably the PCI vendor id of the TPU devices - confirm. */
#define VENDOR_TPU 0x1ae0
#define VENDOR_TPU_STR STRINGIFY(VENDOR_TPU)
83+
/*
 * Larger / smaller of two values. Every use of an argument is parenthesised so
 * that operands containing lower-precedence operators (e.g. `a & b`) compare
 * correctly. Arguments are still evaluated twice: do not pass expressions with
 * side effects.
 */
#define MAX(x, y) (((x) >= (y)) ? (x) : (y))
#define MIN(x, y) (((x) <= (y)) ? (x) : (y))
86+
87+
#define int64 long long
88+
89+
int (*_tpu_chip_count)(void);
90+
int (*_tpu_metrics)(int port, int64 *device_ids, int64 *memory_usage,
91+
int64 *total_memory, double *duty_cycle_pct, int n);
92+
int (*_tpu_pids)(int64 *pids, int n);
93+
94+
char *libname = "libtpuinfo.so";
95+
// -1 means allowing libtpuinfo to select the default port
96+
// env LIBTPUINFO_GRPC_PORT={int} allows setting the port via an environment variable
97+
// $ env LIBTPUINFO_GRPC_PORT=8431 nvtop
98+
int tpu_runtime_monitoring_port = -1;
99+
100+
/* TPU info cache ------------------------------------------------------------------------------- */
101+
struct tpu_chip_usage_data *latest_chips_usage_data = NULL;
102+
nvtop_time last_cache_refresh;
103+
int64 *_pids, *_device_ids, *_memory_usage, *_total_memory;
104+
double* _duty_cycle_pct;
105+
106+
bool is_cache_valid(void) {
107+
nvtop_time current_time;
108+
nvtop_get_current_time(&current_time);
109+
uint64_t t_diff_ns = nvtop_difftime_u64(last_cache_refresh, current_time);
110+
return t_diff_ns < 900 * 1000 * 1000; // 900ms
111+
}
112+
113+
bool refresh_tpu_cache(void) {
114+
if (is_cache_valid()) return true;
115+
nvtop_get_current_time(&last_cache_refresh);
116+
if (tpu_chip_count <= 0) return false;
117+
if (_tpu_pids(_pids, tpu_chip_count) != 0) {
118+
reset_tpu_cache(false);
119+
return false;
120+
}
121+
for (int64_t i = 0; i < tpu_chip_count; i++) latest_chips_usage_data[i].pid = _pids[i];
122+
123+
if (_tpu_metrics(tpu_runtime_monitoring_port, _device_ids, _memory_usage, _total_memory,
124+
_duty_cycle_pct, tpu_chip_count) != 0) return false;
125+
for (int64_t i = 0; i < tpu_chip_count; i++) {
126+
latest_chips_usage_data[i].device_id = _device_ids[i];
127+
latest_chips_usage_data[i].memory_usage = _memory_usage[i];
128+
latest_chips_usage_data[i].total_memory = _total_memory[i];
129+
latest_chips_usage_data[i].duty_cycle_pct = _duty_cycle_pct[i];
130+
}
131+
return true;
132+
}
133+
134+
void reset_tpu_cache(bool fully) {
135+
for (int64_t i = 0; i < tpu_chip_count; i++) {
136+
latest_chips_usage_data[i].memory_usage = 0;
137+
latest_chips_usage_data[i].duty_cycle_pct = 0;
138+
latest_chips_usage_data[i].pid = -1;
139+
if (fully) {
140+
snprintf(latest_chips_usage_data[i].name, sizeof(latest_chips_usage_data[i].name), "%s", "N/A");
141+
latest_chips_usage_data[i].device_id = 0;
142+
latest_chips_usage_data[i].total_memory = 0;
143+
}
144+
}
145+
}
146+
/* TPU info cache ------------------------------------------------------------------------------- */
147+
148+
bool gpuinfo_tpu_init(void) {
149+
char* error_msg;
150+
nvtop_get_current_time(&last_cache_refresh);
151+
// invalidate cache by putting it in the past
152+
last_cache_refresh = nvtop_substract_time(last_cache_refresh, (nvtop_time){10, 0});
153+
154+
// Load dynamic library symbols
155+
void *handle = dlopen(libname, RTLD_LAZY);
156+
if (!handle) {
157+
error_msg = dlerror();
158+
#ifndef NDEBUG
159+
if (error_msg != NULL) fprintf(stderr, "TPU support error: %s\n", error_msg);
160+
#endif
161+
return false;
162+
}
163+
164+
// Resolve the necessary symbols within the library
165+
_tpu_chip_count = dlsym(handle, "tpu_chip_count");
166+
error_msg = dlerror();
167+
if (error_msg != NULL) {
168+
#ifndef NDEBUG
169+
fprintf(stderr, "libtpuinfo can't resolve symbol `tpu_chip_count` with error: %s\n", error_msg);
170+
#endif
171+
return false;
172+
}
173+
_tpu_pids = dlsym(handle, "tpu_pids");
174+
error_msg = dlerror();
175+
if (error_msg != NULL) {
176+
#ifndef NDEBUG
177+
fprintf(stderr, "libtpuinfo can't resolve symbol `tpu_pids` with error: %s\n", error_msg);
178+
#endif
179+
return false;
180+
}
181+
_tpu_metrics = dlsym(handle, "tpu_metrics");
182+
error_msg = dlerror();
183+
if (error_msg != NULL) {
184+
#ifndef NDEBUG
185+
fprintf(stderr, "libtpuinfo can't resolve symbol `tpu_metrics` with error: %s\n", error_msg);
186+
#endif
187+
return false;
188+
}
189+
190+
// Discover TPU devices
191+
tpu_chip_count = _tpu_chip_count();
192+
if (tpu_chip_count == 0) {
193+
#ifndef NDEBUG
194+
fprintf(stderr, "Found 0 TPU devices on the system.\n");
195+
#endif
196+
return false;
197+
}
198+
199+
// Allocate memory for TPU device data cache
200+
latest_chips_usage_data = (struct tpu_chip_usage_data*)malloc(tpu_chip_count*sizeof(struct tpu_chip_usage_data));
201+
_pids = (int64*)malloc(sizeof(int64) * tpu_chip_count);
202+
_device_ids = (int64*)malloc(sizeof(int64) * tpu_chip_count);
203+
_memory_usage = (int64*)malloc(sizeof(int64) * tpu_chip_count);
204+
_total_memory = (int64*)malloc(sizeof(int64) * tpu_chip_count);
205+
_duty_cycle_pct = (double*)malloc(sizeof(double) * tpu_chip_count);
206+
reset_tpu_cache(true);
207+
return true;
208+
}
209+
/*
 * Frees the allocation that `*ptr` refers to and sets `*ptr` to NULL.
 * Does nothing when `ptr` itself is NULL or already holds NULL.
 */
void free_ptr(void **ptr) {
  if (ptr == NULL || *ptr == NULL)
    return;
  free(*ptr);
  *ptr = NULL;
}
216+
217+
void gpuinfo_tpu_shutdown(void) {
218+
free_ptr((void **)&gpu_infos);
219+
free_ptr((void **)&latest_chips_usage_data);
220+
free_ptr((void **)&_pids);
221+
free_ptr((void **)&_device_ids);
222+
free_ptr((void **)&_memory_usage);
223+
free_ptr((void **)&_total_memory);
224+
free_ptr((void **)&_duty_cycle_pct);
225+
tpu_chip_count = -1;
226+
}
227+
/* The TPU backend records no error details: a fixed placeholder is always returned. */
const char *gpuinfo_tpu_last_error_string(void) {
  static const char placeholder[] = "Err";
  return placeholder;
}
229+
230+
static void add_tpu_chip(struct list_head *devices, unsigned *count) {
231+
struct gpu_info_tpu *this_tpu = &gpu_infos[*count];
232+
this_tpu->base.vendor = &gpu_vendor_tpu;
233+
this_tpu->device_id = *count;
234+
snprintf(this_tpu->base.pdev, PDEV_LEN, "TPU%u", *count);
235+
list_add_tail(&this_tpu->base.list, devices);
236+
237+
this_tpu->base.processes_count = 0;
238+
this_tpu->base.processes = NULL;
239+
this_tpu->base.processes_array_size = 0;
240+
241+
*count = *count + 1;
242+
}
243+
244+
bool gpuinfo_tpu_get_device_handles(struct list_head *devices_list, unsigned *count) {
245+
*count = 0;
246+
if (tpu_chip_count <= 0) return false;
247+
gpu_infos = (struct gpu_info_tpu *)calloc(tpu_chip_count, sizeof(*gpu_infos));
248+
if (!gpu_infos) return false;
249+
for (int64_t i = 0; i < tpu_chip_count; i++) add_tpu_chip(devices_list, count);
250+
return true;
251+
}
252+
253+
void gpuinfo_tpu_populate_static_info(struct gpu_info *_gpu_info) {
254+
struct gpu_info_tpu *gpu_info = container_of(_gpu_info, struct gpu_info_tpu, base);
255+
struct gpuinfo_static_info *static_info = &gpu_info->base.static_info;
256+
static_info->integrated_graphics = false;
257+
static_info->encode_decode_shared = false;
258+
RESET_ALL(static_info->valid);
259+
snprintf(static_info->device_name, MIN(sizeof(static_info->device_name), PDEV_LEN), "%s", gpu_info->base.pdev);
260+
SET_VALID(gpuinfo_device_name_valid, static_info->valid);
261+
}
262+
263+
void gpuinfo_tpu_refresh_dynamic_info(struct gpu_info *_gpu_info) {
264+
struct gpu_info_tpu *gpu_info = container_of(_gpu_info, struct gpu_info_tpu, base);
265+
// struct gpuinfo_static_info *static_info = &gpu_info->base.static_info; // unused
266+
struct gpuinfo_dynamic_info *dynamic_info = &gpu_info->base.dynamic_info;
267+
268+
refresh_tpu_cache();
269+
270+
if (gpu_info->device_id >= tpu_chip_count) return;
271+
struct tpu_chip_usage_data usage_data = latest_chips_usage_data[gpu_info->device_id];
272+
double mem_util = round(1e2 * (double)(usage_data.memory_usage) / (double)MAX(1, usage_data.total_memory));
273+
double tpu_util = round(usage_data.duty_cycle_pct);
274+
SET_GPUINFO_DYNAMIC(dynamic_info, gpu_util_rate, (int)tpu_util);
275+
SET_GPUINFO_DYNAMIC(dynamic_info, mem_util_rate, (int)mem_util);
276+
SET_GPUINFO_DYNAMIC(dynamic_info, total_memory, usage_data.total_memory);
277+
SET_GPUINFO_DYNAMIC(dynamic_info, used_memory, usage_data.memory_usage);
278+
SET_GPUINFO_DYNAMIC(dynamic_info, free_memory, usage_data.total_memory - usage_data.memory_usage);
279+
280+
return;
281+
}
282+
283+
void gpuinfo_tpu_get_running_processes(struct gpu_info *_gpu_info) {
284+
struct gpu_info_tpu *gpu_info = container_of(_gpu_info, struct gpu_info_tpu, base);
285+
if (gpu_info->device_id >= tpu_chip_count) return;
286+
if (tpu_chip_count <= 0 || latest_chips_usage_data[gpu_info->device_id].pid < 0) {
287+
_gpu_info->processes_count = 0;
288+
return;
289+
}
290+
_gpu_info->processes_count = 1;
291+
if (_gpu_info->processes_array_size == 0) {
292+
_gpu_info->processes_array_size = 1;
293+
_gpu_info->processes = (struct gpu_process*)malloc(1 * sizeof(struct gpu_process));
294+
memset(_gpu_info->processes, 0, _gpu_info->processes_count * sizeof(*_gpu_info->processes));
295+
}
296+
_gpu_info->processes[0].type = gpu_process_compute;
297+
_gpu_info->processes[0].pid = latest_chips_usage_data[gpu_info->device_id].pid;
298+
_gpu_info->processes[0].gpu_memory_usage = _gpu_info->dynamic_info.used_memory;
299+
300+
SET_VALID(gpuinfo_process_gpu_memory_usage_valid, _gpu_info->processes[0].valid);
301+
}

0 commit comments

Comments
 (0)