Skip to content

Commit c3ef08a

Browse files
committed
Fix incorrect file offset calculation in memory mapping
The current implementation incorrectly assumes that calculating a file offset from a process memory address can be done using a simple subtraction from the library's load address. This assumption doesn't hold for binaries with non-standard ELF layouts, where PT_LOAD segments may have different virtual address to file offset mappings. Fix the issue by: 1. First converting the absolute process address to a library-relative offset by subtracting the library's load point in the process 2. Finding the PT_LOAD segment in the ELF file that contains this offset 3. Using the segment's p_vaddr and p_offset to calculate the correct file offset To avoid performance penalties from repeatedly parsing ELF files, add caching of PT_LOAD segments per library. Example of what was wrong: old: file_offset = addr - lib_start new: file_offset = ((addr - lib_start) - segment->p_vaddr) + segment->p_offset This fixes an issue where pystack would read from incorrect file offsets when analyzing binaries compiled with non-standard layout options (e.g., when using the gold linker with custom flags). Signed-off-by: Pablo Galindo <[email protected]>
1 parent d4333e1 commit c3ef08a

File tree

4 files changed

+114
-4
lines changed

4 files changed

+114
-4
lines changed

news/220.bugfix.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Fix incorrect file offset calculation when analyzing ELF files with
2+
non-standard ELF layouts. Previously, pystack would fail to correctly analyze
3+
Python binaries that had non-standard ELF layouts (for example when compiled
4+
with certain linker options). The fix properly accounts for PT_LOAD segment
5+
mappings when calculating file offsets.

src/pystack/_pystack/mem.cpp

Lines changed: 82 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@
1616

1717
namespace pystack {
1818

19+
using elf_unique_ptr = std::unique_ptr<Elf, std::function<void(Elf*)>>;
20+
using file_unique_ptr = std::unique_ptr<FILE, std::function<int(FILE*)>>;
21+
1922
static ssize_t
2023
_process_vm_readv(
2124
pid_t pid,
@@ -398,17 +401,62 @@ CorefileRemoteMemoryManager::StatusCode
398401
CorefileRemoteMemoryManager::getMemoryLocationFromCore(remote_addr_t addr, off_t* offset_in_file) const
399402
{
400403
auto corefile_it = std::find_if(d_vmaps.cbegin(), d_vmaps.cend(), [&](auto& map) {
401-
return (map.Start() <= addr && addr < map.End()) && (map.FileSize() != 0 && map.Offset() != 0);
404+
// When considering if the data is in the core file, we need to check if the address is
405+
// within the chunk of the segment in the core file. map.End() corresponds
406+
// to the end of the segment in memory when the process was alive but when the core was
407+
// created not all that data will be in the core, so we need to use map.FileSize()
408+
// to get the end of the segment in the core file.
409+
uintptr_t fileEnd = map.Start() + map.FileSize();
410+
return (map.Start() <= addr && addr < fileEnd) && (map.FileSize() != 0 && map.Offset() != 0);
402411
});
403412
if (corefile_it == d_vmaps.cend()) {
404413
return StatusCode::ERROR;
405414
}
406415

407-
unsigned long base = corefile_it->Offset() - corefile_it->Start();
416+
off_t base = corefile_it->Offset() - corefile_it->Start();
408417
*offset_in_file = base + addr;
409418
return StatusCode::SUCCESS;
410419
}
411420

421+
CorefileRemoteMemoryManager::StatusCode
422+
CorefileRemoteMemoryManager::initLoadSegments(const std::string& filename) const
423+
{
424+
file_unique_ptr file(fopen(filename.c_str(), "r"), fclose);
425+
if (!file || fileno(file.get()) == -1) {
426+
return StatusCode::ERROR;
427+
}
428+
429+
auto elf = elf_unique_ptr(elf_begin(fileno(file.get()), ELF_C_READ_MMAP, nullptr), elf_end);
430+
if (!elf) {
431+
return StatusCode::ERROR;
432+
}
433+
434+
std::vector<ElfLoadSegment> segments;
435+
size_t phnum;
436+
if (elf_getphdrnum(elf.get(), &phnum) == 0) {
437+
for (size_t i = 0; i < phnum; i++) {
438+
GElf_Phdr phdr_mem;
439+
GElf_Phdr* phdr = gelf_getphdr(elf.get(), i, &phdr_mem);
440+
if (phdr == nullptr) {
441+
LOG(WARNING) << "Failed to read program header " << i << " from " << filename.c_str()
442+
<< " (" << elf_errmsg(elf_errno()) << ")";
443+
continue;
444+
}
445+
446+
if (phdr->p_type == PT_LOAD) {
447+
segments.push_back(
448+
{.vaddr = phdr->p_vaddr, .offset = phdr->p_offset, .size = phdr->p_filesz});
449+
}
450+
}
451+
}
452+
453+
if (!segments.empty()) {
454+
d_elf_load_segments_cache[filename] = std::move(segments);
455+
return StatusCode::SUCCESS;
456+
}
457+
return StatusCode::ERROR;
458+
}
459+
412460
CorefileRemoteMemoryManager::StatusCode
413461
CorefileRemoteMemoryManager::getMemoryLocationFromElf(
414462
remote_addr_t addr,
@@ -418,12 +466,42 @@ CorefileRemoteMemoryManager::getMemoryLocationFromElf(
418466
auto shared_libs_it = std::find_if(d_shared_libs.cbegin(), d_shared_libs.cend(), [&](auto& map) {
419467
return map.start <= addr && addr <= map.end;
420468
});
469+
421470
if (shared_libs_it == d_shared_libs.cend()) {
422471
return StatusCode::ERROR;
423472
}
473+
424474
*filename = &shared_libs_it->filename;
425-
*offset_in_file = addr - shared_libs_it->start;
426-
return StatusCode::SUCCESS;
475+
476+
// Check if we have cached segments for this file
477+
auto cache_it = d_elf_load_segments_cache.find(**filename);
478+
if (cache_it == d_elf_load_segments_cache.end()) {
479+
// Initialize segments if not in cache
480+
if (initLoadSegments(**filename) != StatusCode::SUCCESS) {
481+
return StatusCode::ERROR;
482+
}
483+
cache_it = d_elf_load_segments_cache.find(**filename);
484+
}
485+
486+
// Get the load address of the elf file from its first segment
487+
remote_addr_t elf_load_addr = cache_it->second[0].vaddr;
488+
489+
// Now relocate the address to the elf file
490+
remote_addr_t symbol_vaddr = addr - shared_libs_it->start + elf_load_addr;
491+
492+
// Find the segment containing this address
493+
for (const auto& segment : cache_it->second) {
494+
if (symbol_vaddr >= segment.vaddr && symbol_vaddr < segment.vaddr + segment.size) {
495+
*offset_in_file = (symbol_vaddr - segment.vaddr) + segment.offset;
496+
return StatusCode::SUCCESS;
497+
}
498+
}
499+
500+
LOG(ERROR) << "Failed to find the correct segment for address " << std::hex << std::showbase << addr
501+
<< " (with vaddr offset " << symbol_vaddr << ")"
502+
<< " in file " << **filename;
503+
504+
return StatusCode::ERROR;
427505
}
428506

429507
bool

src/pystack/_pystack/mem.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,15 @@ class CorefileRemoteMemoryManager : public AbstractRemoteMemoryManager
207207
ERROR,
208208
};
209209

210+
struct ElfLoadSegment
211+
{
212+
GElf_Addr vaddr;
213+
GElf_Off offset;
214+
GElf_Xword size;
215+
};
216+
// Cache for PT_LOAD segments
217+
mutable std::unordered_map<std::string, std::vector<ElfLoadSegment>> d_elf_load_segments_cache;
218+
210219
// Data members
211220
std::shared_ptr<CoreFileAnalyzer> d_analyzer;
212221
std::vector<VirtualMap> d_vmaps;
@@ -220,5 +229,6 @@ class CorefileRemoteMemoryManager : public AbstractRemoteMemoryManager
220229
remote_addr_t addr,
221230
const std::string** filename,
222231
off_t* offset_in_file) const;
232+
StatusCode initLoadSegments(const std::string& filename) const;
223233
};
224234
} // namespace pystack

src/pystack/_pystack/process.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -750,6 +750,23 @@ AbstractProcessManager::findPythonVersion() const
750750
}
751751
int major = (version >> 24) & 0xFF;
752752
int minor = (version >> 16) & 0xFF;
753+
int level = (version >> 4) & 0x0F;
754+
755+
if (major == 0 && minor == 0) {
756+
LOG(DEBUG) << "Failed to determine Python version from symbols: empty data copied";
757+
return {-1, -1};
758+
}
759+
760+
if (major != 2 && major != 3) {
761+
LOG(DEBUG) << "Failed to determine Python version from symbols: invalid major version";
762+
return {-1, -1};
763+
}
764+
765+
if (level != 0xA && level != 0xB && level != 0xC && level != 0xF) {
766+
LOG(DEBUG) << "Failed to determine Python version from symbols: invalid release level";
767+
return {-1, -1};
768+
}
769+
753770
LOG(DEBUG) << "Python version determined from symbols: " << major << "." << minor;
754771
return {major, minor};
755772
}

0 commit comments

Comments
 (0)