diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc index c0960f5a5f9..f00b36600ab 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc @@ -29,11 +29,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include "rocm_smi/rocm_smi_io_link.h" #include "rocm_smi/rocm_smi_kfd.h" @@ -47,6 +49,7 @@ namespace amd::smi { static const char *kKFDProcPathRoot = "/sys/class/kfd/kfd/proc"; static const char *kKFDNodesPathRoot = "/sys/class/kfd/kfd/topology/nodes"; +static const char *kKFDContextPrefix = "context_"; // Prefix for secondary KFD contexts @@ -94,10 +97,48 @@ static const char *kKFDNodePropHIVE_IDStr = "hive_id"; // static const char *kKFDNodePropMAX_ENGINE_CLK_CCOMPUTEStr = // "max_engine_clk_ccompute"; +// KFD process file prefixes for extracting GPU IDs +static const char* kKFDStatsPrefix = "stats_"; +static const char* kKFDVramPrefix = "vram_"; +static const char* kKFDCountersPrefix = "counters_"; +static const char* kKFDSdmaPrefix = "sdma_"; + static bool is_number(const std::string &s) { return !s.empty() && std::all_of(s.begin(), s.end(), ::isdigit); } +// Helper function to get secondary context directories under a KFD process +// Returns a vector of full paths to context_xxxx directories +// For example: /sys/class/kfd/kfd/proc/1685/context_0 +static std::vector GetSecondaryContextPaths(const std::string& proc_path) noexcept { + std::vector context_paths; + + DIR* dir = opendir(proc_path.c_str()); + if (!dir) { + return context_paths; + } + + struct dirent* entry; + while ((entry = readdir(dir)) != nullptr) { + // Skip . and .. + if (entry->d_name[0] == '.') continue; + + // Check if the entry starts with "context_" + if (strncmp(entry->d_name, kKFDContextPrefix, strlen(kKFDContextPrefix)) == 0) { + std::string context_path = proc_path + "/" + entry->d_name; + // Verify it's a directory + struct stat st; + if (stat(context_path.c_str(), &st) == 0 && S_ISDIR(st.st_mode)) { + context_paths.push_back(context_path); + } + } + } + + closedir(dir); + return context_paths; +} + + static std::string KFDDevicePath(uint32_t dev_id) { std::string node_path = kKFDNodesPathRoot; node_path += '/'; @@ -283,6 +324,9 @@ int GetProcessInfo(rsmi_process_info_t *procs, uint32_t num_allocated, std::string proc_id_str; std::string tmp; + // Keep track of PIDs we've already seen to avoid duplicates + // (e.g., if both "1234" and "pid:1234-id:1" exist) + std::unordered_set seen_pids; while (dentry != nullptr) { if (dentry->d_name[0] == '.') { @@ -291,16 +335,42 @@ int GetProcessInfo(rsmi_process_info_t *procs, uint32_t num_allocated, } proc_id_str = dentry->d_name; - assert(is_number(proc_id_str) && "Unexpected file name in kfd/proc dir"); - if (!is_number(proc_id_str)) { + + // Check if the entry is a plain number (traditional format) + if (is_number(proc_id_str)) { + uint32_t pid = static_cast(std::stoul(proc_id_str)); + if (seen_pids.find(pid) == seen_pids.end()) { + seen_pids.insert(pid); + if (procs && *num_procs_found < num_allocated) { + procs[*num_procs_found].process_id = pid; + } + ++(*num_procs_found); + } + } + // Check for "pid:XXXX-id:X" format (alternative format for multi-context processes) + else if (proc_id_str.find("pid:") == 0) { + // Extract PID from "pid:XXXX-id:X" format + size_t dash_pos = proc_id_str.find('-'); + if (dash_pos != std::string::npos) { + std::string pid_part = proc_id_str.substr(4, dash_pos - 4); // Extract XXXX from "pid:XXXX-id:X" + if (is_number(pid_part)) { + uint32_t pid = static_cast(std::stoul(pid_part)); + if (seen_pids.find(pid) == seen_pids.end()) { + seen_pids.insert(pid); + if (procs && *num_procs_found < num_allocated) { + procs[*num_procs_found].process_id = pid; + } + ++(*num_procs_found); + } + } + } + } + else { + // Skip unexpected entries that don't match known formats + // (e.g., non-numeric, non-pid: format files/directories) dentry = readdir(proc_dir); continue; } - if (procs && *num_procs_found < num_allocated) { - procs[*num_procs_found].process_id = - static_cast(std::stoi(proc_id_str)); - } - ++(*num_procs_found); dentry = readdir(proc_dir); } @@ -318,32 +388,72 @@ int GetKfdGpuIdsForPid(long pid, std::unordered_set* out){ out->clear(); std::string pdir = std::string(kKFDProcPathRoot) + "/" + std::to_string(pid); + + // Helper lambda to extract GPU IDs from files in a directory + auto extract_gpu_ids_from_dir = [&out](const std::string& dir_path) { + DIR* d = opendir(dir_path.c_str()); + if (!d) return; + + struct dirent* e; + while ((e = readdir(d))) { + if (e->d_name[0] == '.') continue; // skip "."/".." and hidden entries + + // Grab KFD GPU id from one of these fields + if (!strncmp(e->d_name, kKFDStatsPrefix, strlen(kKFDStatsPrefix))) { + out->insert(strtoull(e->d_name + strlen(kKFDStatsPrefix), nullptr, 10)); + } else if (!strncmp(e->d_name, kKFDVramPrefix, strlen(kKFDVramPrefix))) { + out->insert(strtoull(e->d_name + strlen(kKFDVramPrefix), nullptr, 10)); + } else if (!strncmp(e->d_name, kKFDCountersPrefix, strlen(kKFDCountersPrefix))) { + out->insert(strtoull(e->d_name + strlen(kKFDCountersPrefix), nullptr, 10)); + } else if (!strncmp(e->d_name, kKFDSdmaPrefix, strlen(kKFDSdmaPrefix))) { + out->insert(strtoull(e->d_name + strlen(kKFDSdmaPrefix), nullptr, 10)); + } + } + closedir(d); + }; + DIR* d = opendir(pdir.c_str()); if (!d) { perror(("Unable to open KFD process directory for process " + std::to_string(pid)).c_str()); return errno ? errno : ESRCH; } + closedir(d); - struct dirent* e; - - while ((e = readdir(d))) { + // Use the lambda for the primary process directory (instead of duplicating code) + extract_gpu_ids_from_dir(pdir); - if (e->d_name[0] == '.') continue; // skip "."/".." and hidden entries + // Also check secondary contexts (context_xxxx directories) + // These are created by the KFD multiple contexts feature + std::vector context_paths = GetSecondaryContextPaths(pdir); + for (const auto& context_path : context_paths) { + extract_gpu_ids_from_dir(context_path); + } - // Grab KFD GPU id from one of these fields - if (!strncmp(e->d_name, "stats_", 6)) { - out->insert(strtoull(e->d_name + 6, nullptr, 10)); - } else if (!strncmp(e->d_name, "vram_", 5)) { - out->insert(strtoull(e->d_name + 5, nullptr, 10)); - } else if (!strncmp(e->d_name, "counters_", 9)) { - out->insert(strtoull(e->d_name + 9, nullptr, 10)); - } else if (!strncmp(e->d_name, "sdma_", 5)) { - out->insert(strtoull(e->d_name + 5, nullptr, 10)); + // Also check for "pid:PID-id:X" format directories at the parent level + // This is another format used for multi-context processes + std::string pid_prefix = "pid:" + std::to_string(pid) + "-id:"; + DIR* proc_root = opendir(kKFDProcPathRoot); + if (proc_root) { + struct dirent* root_entry; + while ((root_entry = readdir(proc_root))) { + if (root_entry->d_name[0] == '.') continue; + std::string entry_name = root_entry->d_name; + if (entry_name.find(pid_prefix) == 0) { + // Found a pid:PID-id:X directory for this process + std::string alternate_path = std::string(kKFDProcPathRoot) + "/" + entry_name; + extract_gpu_ids_from_dir(alternate_path); + + // Also check for context_xxxx in this alternate path + std::vector alt_context_paths = GetSecondaryContextPaths(alternate_path); + for (const auto& alt_context_path : alt_context_paths) { + extract_gpu_ids_from_dir(alt_context_path); + } + } } + closedir(proc_root); } - closedir(d); return 0; } @@ -352,6 +462,7 @@ int GetKfdGpuIdsForPid(long pid, std::unordered_set* out){ // gpus_found. // Directory structure: // /sys/class/kfd/kfd/proc//queues//gpuid +// /sys/class/kfd/kfd/proc//context_/queues//gpuid (for secondary contexts) int GetProcessGPUs(uint32_t pid, std::unordered_set *gpu_set) { int err; @@ -366,58 +477,103 @@ int GetProcessGPUs(uint32_t pid, std::unordered_set *gpu_set) { return 0; } - std::string queues_dir = kKFDProcPathRoot; - queues_dir += "/"; - queues_dir += std::to_string(pid); - queues_dir += "/queues"; + std::string proc_path = std::string(kKFDProcPathRoot) + "/" + std::to_string(pid); - auto queues_dir_hd = opendir(queues_dir.c_str()); + // Helper lambda to read GPU IDs from queues in a given base path + auto read_gpus_from_queues = [&](const std::string& base_path) -> int { + std::string queues_dir = base_path + "/queues"; + auto queues_dir_hd = opendir(queues_dir.c_str()); - if (queues_dir_hd == nullptr) { - std::string err_str = "Unable to open queues directory for process "; - err_str += std::to_string(pid); - perror(err_str.c_str()); - return ESRCH; - } + if (queues_dir_hd == nullptr) { + // Directory doesn't exist, which is okay for secondary contexts + return 0; + } - auto q_dentry = readdir(queues_dir_hd); + auto q_dentry = readdir(queues_dir_hd); + std::string tmp; - std::string q_gpu_id_str; - std::string q_dir; + while (q_dentry != nullptr) { + if (q_dentry->d_name[0] == '.') { + q_dentry = readdir(queues_dir_hd); + continue; + } - std::string tmp; + if (!is_number(q_dentry->d_name)) { + q_dentry = readdir(queues_dir_hd); + continue; + } - while (q_dentry != nullptr) { - if (q_dentry->d_name[0] == '.') { - q_dentry = readdir(queues_dir_hd); - continue; - } + std::string q_gpu_id_str = queues_dir + '/' + q_dentry->d_name + "/gpuid"; - if (!is_number(q_dentry->d_name)) { - q_dentry = readdir(queues_dir_hd); - continue; - } + int read_err = ReadSysfsStr(q_gpu_id_str, &tmp); + if (read_err) { + q_dentry = readdir(queues_dir_hd); + continue; + } - q_gpu_id_str = queues_dir + '/' + q_dentry->d_name + "/gpuid"; + uint64_t val; + try { + val = static_cast(std::stoi(tmp)); + } catch (...) { + std::cerr << "Error; read invalid data: " << tmp << " from " << + q_gpu_id_str << std::endl; + closedir(queues_dir_hd); + return ENXIO; // Return "no such device" if we read an invalid gpu id + } + gpu_set->insert(val); - err = ReadSysfsStr(q_gpu_id_str, &tmp); - if (err) { q_dentry = readdir(queues_dir_hd); - continue; } - uint64_t val; - try { - val = static_cast(std::stoi(tmp)); - } catch (...) { - std::cerr << "Error; read invalid data: " << tmp << " from " << - q_gpu_id_str << std::endl; - closedir(queues_dir_hd); - return ENXIO; // Return "no such device" if we read an invalid gpu id + closedir(queues_dir_hd); + return 0; + }; + + // Read from primary process queues + err = read_gpus_from_queues(proc_path); + if (err != 0 && err != ESRCH) { + return err; + } + + // Read from secondary context queues + std::vector context_paths = GetSecondaryContextPaths(proc_path); + for (const auto& context_path : context_paths) { + err = read_gpus_from_queues(context_path); + if (err != 0 && err != ESRCH) { + return err; } - gpu_set->insert(val); + } + + // Also check for "pid:PID-id:X" format directories at the parent level + // This is another format used for multi-context processes + std::string pid_prefix = "pid:" + std::to_string(pid) + "-id:"; + DIR* proc_root = opendir(kKFDProcPathRoot); + if (proc_root) { + struct dirent* root_entry; + while ((root_entry = readdir(proc_root))) { + if (root_entry->d_name[0] == '.') continue; + std::string entry_name = root_entry->d_name; + if (entry_name.find(pid_prefix) == 0) { + // Found a pid:PID-id:X directory for this process + std::string alternate_path = std::string(kKFDProcPathRoot) + "/" + entry_name; + err = read_gpus_from_queues(alternate_path); + if (err != 0 && err != ESRCH) { + closedir(proc_root); + return err; + } - q_dentry = readdir(queues_dir_hd); + // Also check for context_xxxx in this alternate path + std::vector alt_context_paths = GetSecondaryContextPaths(alternate_path); + for (const auto& alt_context_path : alt_context_paths) { + err = read_gpus_from_queues(alt_context_path); + if (err != 0 && err != ESRCH) { + closedir(proc_root); + return err; + } + } + } + } + closedir(proc_root); } // if no queues were present, fallback to grab KFD GPU IDs from parent dir names @@ -426,10 +582,6 @@ int GetProcessGPUs(uint32_t pid, std::unordered_set *gpu_set) { return kfd_ret; } - errno = 0; - if (closedir(queues_dir_hd)) { - return errno; - } return 0; } @@ -471,9 +623,7 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, std::unordered_set::iterator itr; uint32_t kfd_stat; - std::string proc_str_path = kKFDProcPathRoot; - proc_str_path += "/"; - proc_str_path += std::to_string(pid); + std::string proc_str_path = std::string(kKFDProcPathRoot) + "/" + std::to_string(pid); if (!FileExists(proc_str_path.c_str())) { return ESRCH; @@ -485,64 +635,115 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, proc->cu_occupancy = 0; proc->evicted_time = 0; - for (itr = gpu_set->begin(); itr != gpu_set->end(); itr++) { - uint64_t gpu_id = (*itr); - - std::string vram_str_path = proc_str_path; - vram_str_path += "/vram_"; - vram_str_path += std::to_string(gpu_id); + // Collect all paths to read metrics from: primary process + secondary contexts + std::vector metric_paths; + metric_paths.push_back(proc_str_path); - err = ReadSysfsStr(vram_str_path, &tmp); - auto sysfs_data_errcode = CheckValidProcessInfoData(tmp, err); + // Add secondary context paths (context_xxxx directories) + // These are created by the KFD multiple contexts feature + std::vector context_paths = GetSecondaryContextPaths(proc_str_path); + for (const auto& context_path : context_paths) { + metric_paths.push_back(context_path); + } - // Report all errors, except ENOENT (2), which should be ignored - // and the proc->vram_usage should be unmodified - if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){ - return sysfs_data_errcode; - } - // Do not store any invalid values - else if (sysfs_data_errcode == 0) { - proc->vram_usage += std::stoull(tmp); + // Also check for "pid:PID-id:X" format directories at the parent level + // This is another format used for multi-context processes + std::string pid_prefix = "pid:" + std::to_string(pid) + "-id:"; + DIR* proc_root = opendir(kKFDProcPathRoot); + if (proc_root) { + struct dirent* root_entry; + while ((root_entry = readdir(proc_root))) { + if (root_entry->d_name[0] == '.') continue; + std::string entry_name = root_entry->d_name; + if (entry_name.find(pid_prefix) == 0) { + // Found a pid:PID-id:X directory for this process + std::string alternate_path = std::string(kKFDProcPathRoot) + "/" + entry_name; + metric_paths.push_back(alternate_path); + + // Also check for context_xxxx in this alternate path + std::vector alt_context_paths = GetSecondaryContextPaths(alternate_path); + for (const auto& alt_context_path : alt_context_paths) { + metric_paths.push_back(alt_context_path); + } + } } + closedir(proc_root); + } - std::string sdma_str_path = proc_str_path; - sdma_str_path += "/sdma_"; - sdma_str_path += std::to_string(gpu_id); + for (const auto& gpu_id : *gpu_set) { - err = ReadSysfsStr(sdma_str_path, &tmp); - sysfs_data_errcode = CheckValidProcessInfoData(tmp, err); + // Aggregate metrics from primary process and all secondary contexts + for (const auto& metric_base_path : metric_paths) { + std::string vram_str_path = metric_base_path + "/vram_" + std::to_string(gpu_id); - if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){ - return sysfs_data_errcode; - } - else if (sysfs_data_errcode == 0) { - proc->sdma_usage += std::stoull(tmp); - } + err = ReadSysfsStr(vram_str_path, &tmp); + auto sysfs_data_errcode = CheckValidProcessInfoData(tmp, err); - // Build the path and read from Sysfs file, info that - // encodes Compute Unit usage by a process of interest - std::string cu_occupancy_path = proc_str_path; - cu_occupancy_path += "/stats_"; - cu_occupancy_path += std::to_string(gpu_id); - cu_occupancy_path += "/cu_occupancy"; + // Report all errors, except ENOENT (2), which should be ignored + // and the proc->vram_usage should be unmodified + if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){ + return sysfs_data_errcode; + } + // Do not store any invalid values + else if (sysfs_data_errcode == 0) { + proc->vram_usage += std::stoull(tmp); + } - err = GetProcessKFDStats(cu_occupancy_path, kfd_stat); - if (err != 0){ - return err; - } - proc->cu_occupancy = kfd_stat; + std::string sdma_str_path = metric_base_path + "/sdma_" + std::to_string(gpu_id); - std::string evicted_time_path = proc_str_path; - evicted_time_path += "/stats_"; - evicted_time_path += std::to_string(gpu_id); - evicted_time_path += "/evicted_ms"; + err = ReadSysfsStr(sdma_str_path, &tmp); + sysfs_data_errcode = CheckValidProcessInfoData(tmp, err); - err = GetProcessKFDStats(evicted_time_path, kfd_stat); - if (err != 0){ - return err; - } - proc->evicted_time = kfd_stat; + if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){ + return sysfs_data_errcode; + } + else if (sysfs_data_errcode == 0) { + proc->sdma_usage += std::stoull(tmp); + } + + // Build the path and read from Sysfs file, info that + // encodes Compute Unit usage by a process of interest + std::string cu_occupancy_path = metric_base_path + "/stats_" + std::to_string(gpu_id) + "/cu_occupancy"; + + err = GetProcessKFDStats(cu_occupancy_path, kfd_stat); + if (err != 0) { + // ENOENT is acceptable for secondary contexts where stats may not exist + // Only return error for: non-ENOENT errors, OR primary process with existing file + bool is_primary = (metric_base_path == proc_str_path); + bool file_exists = FileExists(cu_occupancy_path.c_str()); + if (err != ENOENT || (is_primary && file_exists)) { + return err; + } + } else { + // Aggregate cu_occupancy (use max value as it represents peak usage) + if (kfd_stat != KFD_STATS_INVALID && kfd_stat > proc->cu_occupancy) { + proc->cu_occupancy = kfd_stat; + } + } + std::string evicted_time_path = metric_base_path + "/stats_" + std::to_string(gpu_id) + "/evicted_ms"; + + err = GetProcessKFDStats(evicted_time_path, kfd_stat); + if (err != 0) { + // ENOENT is acceptable for secondary contexts where stats may not exist + // Only return error for: non-ENOENT errors, OR primary process with existing file + bool is_primary_ctx = (metric_base_path == proc_str_path); + bool file_found = FileExists(evicted_time_path.c_str()); + if (err != ENOENT || (is_primary_ctx && file_found)) { + return err; + } + } else { + // Aggregate evicted_time (sum all evicted times) + if (kfd_stat != KFD_STATS_INVALID) { + // Handle potential overflow by checking before addition + if (proc->evicted_time <= UINT32_MAX - kfd_stat) { + proc->evicted_time += kfd_stat; + } else { + proc->evicted_time = UINT32_MAX; // Cap at max value + } + } + } + } // End of metric_paths loop } return 0; diff --git a/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc b/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc index eb1e183a4b7..c1dd8ae850b 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc @@ -20,6 +20,8 @@ * THE SOFTWARE. */ +#include +#include #include #include #include @@ -33,6 +35,9 @@ namespace amd::smi { +// Constant for KFD context directory prefix +static constexpr const char* kContextPrefix = "context_"; + uint32_t AMDSmiGPUDevice::get_gpu_id() const { return gpu_id_; } @@ -184,28 +189,90 @@ int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t& // Safely handle KFD processes to get total memory_usage of the process uint64_t kfd_gpu_id = get_kfd_gpu_id(); - std::string kfd_path = "/sys/class/kfd/kfd/proc/" + - std::to_string(rsmi_proc_info.process_id) + - "/vram_" + std::to_string(kfd_gpu_id); - - // Check if the file exists before attempting to open it - if (access(kfd_path.c_str(), R_OK) == 0) { - std::ifstream kfd_file(kfd_path.c_str()); - if (kfd_file.is_open()) { - std::string line; - if (std::getline(kfd_file, line)) { - try { - uint64_t vram_bytes = std::stoull(line); - amdsmi_proc_info.mem = vram_bytes; // Already in bytes - } catch (const std::exception& e) { - // Handle conversion error gracefully - std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << " | Failed to parse VRAM value from KFD: " << e.what(); - LOG_DEBUG(ss); + std::string kfd_proc_path = "/sys/class/kfd/kfd/proc/" + + std::to_string(rsmi_proc_info.process_id); + std::string kfd_vram_file = "/vram_" + std::to_string(kfd_gpu_id); + + // Helper for safe addition without overflow + auto safe_add = [](uint64_t a, uint64_t b) -> uint64_t { + return (a > UINT64_MAX - b) ? UINT64_MAX : a + b; + }; + // Helper lambda to read VRAM from a path. + // Returns 0 if file doesn't exist or can't be read (intentional for optional paths). + // Logs parse errors via LOG_INFO but doesn't propagate them - this is a best-effort + // aggregation where partial data is better than failing the entire operation. + auto read_vram_from_path = [&kfd_vram_file](const std::string& base_path) -> uint64_t { + uint64_t vram_bytes = 0; + std::string vram_path = base_path + kfd_vram_file; + + // File may not exist for secondary contexts - this is expected, not an error + if (access(vram_path.c_str(), R_OK) != 0) { + return 0; // File doesn't exist or not readable - expected for optional paths + } + + std::ifstream kfd_file(vram_path); + if (!kfd_file.is_open()) { + return 0; // Couldn't open file - treat as no data available + } + + std::string line; + if (std::getline(kfd_file, line)) { + try { + vram_bytes = std::stoull(line); + } catch (const std::exception& e) { + // Parse error is unexpected - log it for debugging + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | Failed to parse VRAM value from KFD: " << e.what(); + LOG_INFO(ss); + // Return 0 rather than failing - best effort aggregation + } + } + kfd_file.close(); + return vram_bytes; + }; + + // Helper lambda to read VRAM from all contexts in a directory + auto read_vram_from_all_contexts = [&read_vram_from_path, &safe_add](const std::string& base_path) -> uint64_t { + uint64_t total = read_vram_from_path(base_path); + + // Check for secondary contexts (context_xxxx directories) + DIR* dir = opendir(base_path.c_str()); + if (dir != nullptr) { + struct dirent* entry; + while ((entry = readdir(dir)) != nullptr) { + if (strncmp(entry->d_name, kContextPrefix, strlen(kContextPrefix)) == 0) { + std::string context_path = base_path + "/" + entry->d_name; + total = safe_add(total, read_vram_from_path(context_path)); } } - kfd_file.close(); + closedir(dir); + } + return total; + }; + + // Read VRAM from primary process + uint64_t total_vram = read_vram_from_all_contexts(kfd_proc_path); + + // Also check for "pid:PID-id:X" format directories at the parent level + // This is another format used for multi-context processes + std::string kfd_root = "/sys/class/kfd/kfd/proc/"; + std::string pid_prefix = "pid:" + std::to_string(rsmi_proc_info.process_id) + "-id:"; + DIR* proc_root = opendir(kfd_root.c_str()); + if (proc_root != nullptr) { + struct dirent* root_entry; + while ((root_entry = readdir(proc_root)) != nullptr) { + if (root_entry->d_name[0] == '.') continue; + std::string entry_name = root_entry->d_name; + if (entry_name.find(pid_prefix) == 0) { + std::string alternate_path = kfd_root + entry_name; + total_vram = safe_add(total_vram, read_vram_from_all_contexts(alternate_path)); + } } + closedir(proc_root); + } + + if (total_vram > 0) { + amdsmi_proc_info.mem = total_vram; } return status_code;