Skip to content

Commit bfb09df

Browse files
Merge branch 'pr-165' into fix/rate_limiter
2 parents c83fc4c + 9c1c6ff commit bfb09df

File tree

8 files changed

+84
-39
lines changed

8 files changed

+84
-39
lines changed

src/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@ add_subdirectory(cuda)
1414
add_subdirectory(nvml)
1515

1616
set(LIBVGPU vgpu)
17-
add_library(${LIBVGPU} SHARED libvgpu.c utils.c $<TARGET_OBJECTS:nvml_mod> $<TARGET_OBJECTS:cuda_mod> $<TARGET_OBJECTS:allocator_mod> $<TARGET_OBJECTS:multiprocess_mod>)
17+
add_library(${LIBVGPU} SHARED libvgpu.c utils.c log_utils.c $<TARGET_OBJECTS:nvml_mod> $<TARGET_OBJECTS:cuda_mod> $<TARGET_OBJECTS:allocator_mod> $<TARGET_OBJECTS:multiprocess_mod>)
1818
target_compile_options(${LIBVGPU} PUBLIC ${LIBRARY_COMPILE_FLAGS})
1919
target_link_libraries(${LIBVGPU} PUBLIC -lcuda -lnvidia-ml)
2020

src/allocator/allocator.c

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -34,8 +34,6 @@ size_t round_up(size_t size, size_t unit) {
3434
}
3535

3636
int oom_check(const int dev, size_t addon) {
37-
int count1=0;
38-
CUDA_OVERRIDE_CALL(cuda_library_entry,cuDeviceGetCount,&count1);
3937
CUdevice d;
4038
if (dev==-1)
4139
cuCtxGetDevice(&d);

src/include/log_utils.h

Lines changed: 22 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -10,33 +10,39 @@
1010

1111
extern FILE *fp1;
1212

13-
#ifdef FILEDEBUG
13+
/*
14+
* Cached log level — initialized once by log_utils_init().
15+
* Default is 2 (WARN/MSG/ERROR) to match original behavior when
16+
* LIBCUDA_LOG_LEVEL is unset.
17+
*
18+
* Levels: 0=off, 1=error-only, 2=warn(default), 3=info, 4=debug
19+
*/
20+
extern int g_log_level;
21+
22+
/* Call once during early initialization to cache LIBCUDA_LOG_LEVEL. */
23+
void log_utils_init(void);
24+
25+
#ifdef FILEDEBUG
1426
#define LOG_DEBUG(msg, ...) { \
15-
if ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=4)) {\
27+
if (g_log_level >= 4) {\
1628
if (fp1==NULL) fp1 = fopen ("/tmp/vgpulog", "a"); \
1729
fprintf(fp1, "[HAMI-core Debug(%d:%ld:%s:%d)]: "msg"\n",getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
1830
}\
1931
}
2032
#define LOG_INFO(msg, ...) { \
21-
if ( \
22-
/*(getenv("LIBCUDA_LOG_LEVEL")==NULL) || */\
23-
(getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=3)) {\
33+
if (g_log_level >= 3) {\
2434
if (fp1==NULL) fp1 = fopen ("/tmp/vgpulog", "a"); \
2535
fprintf(fp1, "[HAMI-core Info(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
2636
}\
2737
}
2838
#define LOG_WARN(msg, ...) { \
29-
if ( \
30-
(getenv("LIBCUDA_LOG_LEVEL")==NULL) || \
31-
((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) {\
39+
if (g_log_level >= 2) {\
3240
if (fp1==NULL) fp1 = fopen ("/tmp/vgpulog", "a"); \
3341
fprintf(fp1, "[HAMI-core Warn(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
3442
}\
3543
}
3644
#define LOG_MSG(msg, ...) { \
37-
if ( \
38-
(getenv("LIBCUDA_LOG_LEVEL")==NULL) || \
39-
((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) {\
45+
if (g_log_level >= 2) {\
4046
if (fp1==NULL) fp1 = fopen ("/tmp/vgpulog", "a"); \
4147
fprintf(fp1, "[HAMI-core Msg(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
4248
}\
@@ -47,27 +53,22 @@ extern FILE *fp1;
4753
}
4854
#else
4955
#define LOG_DEBUG(msg, ...) { \
50-
if ((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=4)) {\
56+
if (g_log_level >= 4) {\
5157
fprintf(stderr, "[HAMI-core Debug(%d:%ld:%s:%d)]: "msg"\n",getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
5258
}\
5359
}
5460
#define LOG_INFO(msg, ...) { \
55-
if ( \
56-
(getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=3)) {\
61+
if (g_log_level >= 3) {\
5762
fprintf(stderr, "[HAMI-core Info(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
5863
}\
5964
}
6065
#define LOG_WARN(msg, ...) { \
61-
if ( \
62-
(getenv("LIBCUDA_LOG_LEVEL")==NULL) || \
63-
((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) {\
66+
if (g_log_level >= 2) {\
6467
fprintf(stderr, "[HAMI-core Warn(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
6568
}\
6669
}
6770
#define LOG_MSG(msg, ...) { \
68-
if ( \
69-
(getenv("LIBCUDA_LOG_LEVEL")==NULL) || \
70-
((getenv("LIBCUDA_LOG_LEVEL")!=NULL) && (atoi(getenv("LIBCUDA_LOG_LEVEL"))>=2))) {\
71+
if (g_log_level >= 2) {\
7172
fprintf(stderr, "[HAMI-core Msg(%d:%ld:%s:%d)]: "msg"\n", getpid(),pthread_self(),basename(__FILE__),__LINE__,##__VA_ARGS__); \
7273
}\
7374
}
@@ -107,7 +108,7 @@ extern FILE *fp1;
107108
#define IF_CHECK_OOM(res) { \
108109
if (res < 0) \
109110
return CUDA_ERROR_OUT_OF_MEMORY; \
110-
}
111+
}
111112

112113

113114
#endif

src/libvgpu.c

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -853,6 +853,7 @@ void* __dlsym_hook_section_nvml(void* handle, const char* symbol) {
853853
}
854854

855855
void preInit(){
856+
log_utils_init();
856857
LOG_MSG("Initializing.....");
857858
if (real_dlsym == NULL) {
858859
real_dlsym = dlvsym(RTLD_NEXT,"dlsym","GLIBC_2.2.5");

src/log_utils.c

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,18 @@
1+
#include <stdio.h>
2+
#include <stdlib.h>
3+
4+
/*
5+
* Cached log level, read once from LIBCUDA_LOG_LEVEL by log_utils_init().
6+
* Default 2 = warn/msg/error (matches original behavior when env is unset).
7+
*/
8+
int g_log_level = 2;
9+
10+
FILE *fp1 = NULL;
11+
12+
void log_utils_init(void) {
13+
const char *env = getenv("LIBCUDA_LOG_LEVEL");
14+
if (env != NULL) {
15+
g_log_level = atoi(env);
16+
}
17+
/* else: keep default of 2 (warn level) */
18+
}

src/multiprocess/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,6 @@ add_library(multiprocess_mod OBJECT multiprocess_memory_limit.c multiprocess_uti
33
target_compile_options(multiprocess_mod PUBLIC ${LIBRARY_COMPILE_FLAGS})
44
target_link_libraries(multiprocess_mod PUBLIC nvidia-ml)
55

6-
add_executable(shrreg-tool shrreg_tool.c)
6+
add_executable(shrreg-tool shrreg_tool.c ${CMAKE_CURRENT_SOURCE_DIR}/../log_utils.c)
77
target_link_libraries(shrreg-tool multiprocess_mod -lpthread -lcuda)
88

src/multiprocess/multiprocess_memory_limit.c

Lines changed: 24 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -221,14 +221,24 @@ int active_oom_killer() {
221221
}
222222

223223
void pre_launch_kernel() {
224-
uint64_t now = time(NULL);
224+
struct timespec ts;
225+
clock_gettime(CLOCK_REALTIME_COARSE, &ts);
226+
uint64_t now = (uint64_t)ts.tv_sec;
227+
228+
// Fast path: skip mutex if within recording interval (double-checked)
229+
if (now - region_info.last_kernel_time < _record_kernel_interval) {
230+
return;
231+
}
232+
225233
pthread_mutex_lock(&_kernel_mutex);
234+
// Re-check under lock — another thread may have updated
226235
if (now - region_info.last_kernel_time < _record_kernel_interval) {
227236
pthread_mutex_unlock(&_kernel_mutex);
228237
return;
229238
}
230239
region_info.last_kernel_time = now;
231240
pthread_mutex_unlock(&_kernel_mutex);
241+
232242
LOG_INFO("write last kernel time: %ld", now)
233243
// Lock-free update using atomic compare-exchange
234244
uint64_t expected = atomic_load_explicit(&region_info.shared_region->last_kernel_time, memory_order_acquire);
@@ -1231,10 +1241,20 @@ void resume_all(){
12311241
}
12321242

12331243
int wait_status_self(int status){
1244+
// Fast path: use cached slot pointer (set during init_proc_slot_withlock)
1245+
if (region_info.my_slot != NULL) {
1246+
int32_t cur = atomic_load_explicit(&region_info.my_slot->status, memory_order_acquire);
1247+
return (cur == status) ? 1 : 0;
1248+
}
1249+
1250+
// Slow path: linear scan (only if my_slot not yet cached)
12341251
int i;
1235-
for (i=0;i<region_info.shared_region->proc_num;i++){
1236-
if (region_info.shared_region->procs[i].pid==getpid()){
1237-
if (region_info.shared_region->procs[i].status==status)
1252+
int proc_num = atomic_load_explicit(&region_info.shared_region->proc_num, memory_order_acquire);
1253+
int32_t my_pid = getpid();
1254+
for (i=0;i<proc_num;i++){
1255+
int32_t slot_pid = atomic_load_explicit(&region_info.shared_region->procs[i].pid, memory_order_acquire);
1256+
if (slot_pid==my_pid){
1257+
if (atomic_load_explicit(&region_info.shared_region->procs[i].status, memory_order_acquire)==status)
12381258
return 1;
12391259
else
12401260
return 0;

src/multiprocess/multiprocess_utilization_watcher.c

Lines changed: 17 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -31,23 +31,29 @@ static volatile long g_total_cuda_cores = 0;
3131
extern int pidfound;
3232
int cuda_to_nvml_map_array[CUDA_DEVICE_MAX_COUNT];
3333

34+
/* Cached at init — these values do not change at runtime */
35+
static int cached_sm_limit = 0;
36+
static int cached_util_switch = 0;
37+
3438
void rate_limiter(int grids, int blocks) {
3539
long before_cuda_cores = 0;
3640
long after_cuda_cores = 0;
3741
long kernel_size = grids;
3842

43+
/* Fast exit using cached values — no shared memory access needed */
44+
if (cached_sm_limit >= 100 || cached_sm_limit == 0)
45+
return;
46+
if (cached_util_switch == 0)
47+
return;
48+
3949
while (get_recent_kernel()<0) {
4050
sleep(1);
4151
}
4252
set_recent_kernel(2);
43-
if ((get_current_device_sm_limit(0)>=100) || (get_current_device_sm_limit(0)==0))
44-
return;
45-
if (get_utilization_switch()==0)
46-
return;
53+
4754
LOG_DEBUG("grid: %d, blocks: %d", grids, blocks);
4855
LOG_DEBUG("launch kernel %ld, curr core: %ld", kernel_size, g_cur_cuda_cores);
49-
//if (g_vcuda_config.enable) {
50-
do {
56+
do {
5157
CHECK:
5258
before_cuda_cores = g_cur_cuda_cores;
5359
LOG_DEBUG("current core: %ld", g_cur_cuda_cores);
@@ -56,8 +62,7 @@ void rate_limiter(int grids, int blocks) {
5662
goto CHECK;
5763
}
5864
after_cuda_cores = before_cuda_cores - kernel_size;
59-
} while (!CAS(&g_cur_cuda_cores, before_cuda_cores, after_cuda_cores));
60-
//}
65+
} while (!CAS(&g_cur_cuda_cores, before_cuda_cores, after_cuda_cores));
6166
}
6267

6368
static void change_token(long delta) {
@@ -221,10 +226,12 @@ void* utilization_watcher() {
221226
}
222227

223228
void init_utilization_watcher() {
224-
LOG_INFO("set core utilization limit to %d",get_current_device_sm_limit(0));
229+
cached_sm_limit = get_current_device_sm_limit(0);
230+
cached_util_switch = get_utilization_switch();
231+
LOG_INFO("set core utilization limit to %d",cached_sm_limit);
225232
setspec();
226233
pthread_t tid;
227-
if ((get_current_device_sm_limit(0)<=100) && (get_current_device_sm_limit(0)>0)){
234+
if ((cached_sm_limit <= 100) && (cached_sm_limit > 0)){
228235
pthread_create(&tid, NULL, utilization_watcher, NULL);
229236
}
230237
return;

0 commit comments

Comments
 (0)