From 232e0e142de388f25865e9df6b5d77c728fe36b8 Mon Sep 17 00:00:00 2001 From: BretasArthur1 Date: Thu, 20 Nov 2025 00:55:44 -0300 Subject: [PATCH 1/4] feat: add support for LLVM IR files in the linker --- Cargo.lock | 1 + Cargo.toml | 1 + src/linker.rs | 66 ++++++++++++++++++++------- src/llvm/mod.rs | 49 ++++++++++++++++++-- tests/ir_file_test.rs | 103 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 201 insertions(+), 19 deletions(-) create mode 100644 tests/ir_file_test.rs diff --git a/Cargo.lock b/Cargo.lock index 61de611a..d9bef42c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -119,6 +119,7 @@ dependencies = [ "log", "regex", "rustc-build-sysroot", + "tempfile", "thiserror 2.0.17", "tracing", "tracing-appender", diff --git a/Cargo.toml b/Cargo.toml index 1a506ce9..aabd4238 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,6 +40,7 @@ compiletest_rs = { version = "0.11.0" } regex = { version = "1.11.1", default-features = false } rustc-build-sysroot = { workspace = true } which = { version = "8.0.0", default-features = false, features = ["real-sys", "regex"] } +tempfile = "3.13" [lints] workspace = true diff --git a/src/linker.rs b/src/linker.rs index faea046b..0d4a3822 100644 --- a/src/linker.rs +++ b/src/linker.rs @@ -206,6 +206,8 @@ enum InputType { MachO, /// Archive file. (.a) Archive, + /// IR file (.ll) + Ir, } impl std::fmt::Display for InputType { @@ -218,6 +220,7 @@ impl std::fmt::Display for InputType { Self::Elf => "elf", Self::MachO => "Mach-O", Self::Archive => "archive", + Self::Ir => "ir", } ) } @@ -508,7 +511,7 @@ where .ok_or(LinkerError::CreateModuleError)?; // buffer used to perform file type detection - let mut buf = [0u8; 8]; + let mut buf = [0u8; 1024]; for mut input in inputs { let path = match input { InputReader::File { path, .. 
} => path.into(), @@ -517,14 +520,14 @@ where // determine whether the input is bitcode, ELF with embedded bitcode, an archive file // or an invalid file - input - .read_exact(&mut buf) + let bytes_read = input + .read(&mut buf) .map_err(|e| LinkerError::IoError(path.clone(), e))?; input .rewind() .map_err(|e| LinkerError::IoError(path.clone(), e))?; - let in_type = - detect_input_type(&buf).ok_or_else(|| LinkerError::InvalidInputType(path.clone()))?; + let in_type = detect_input_type(&buf[..bytes_read]) + .ok_or_else(|| LinkerError::InvalidInputType(path.clone()))?; match in_type { InputType::Archive => { @@ -587,13 +590,29 @@ fn link_reader<'ctx>( .or_else(|| detect_input_type(&data)) .ok_or_else(|| LinkerError::InvalidInputType(path.to_owned()))?; - let bitcode = match in_type { - InputType::Bitcode => data, - InputType::Elf => match llvm::find_embedded_bitcode(context, &data) { - Ok(Some(bitcode)) => bitcode, - Ok(None) => return Err(LinkerError::MissingBitcodeSection(path.to_owned())), - Err(e) => return Err(LinkerError::EmbeddedBitcodeError(e)), - }, + match in_type { + InputType::Bitcode => { + if !llvm::link_bitcode_buffer(context, module, &data) { + return Err(LinkerError::LinkModuleError(path.to_owned())); + } + } + InputType::Ir => { + data.push(0); // force push null terminator + let data = CStr::from_bytes_with_nul(&data).unwrap(); + if !llvm::link_ir_buffer(context, module, data) { + return Err(LinkerError::LinkModuleError(path.to_owned())); + } + } + InputType::Elf => { + let bitcode = match llvm::find_embedded_bitcode(context, &data) { + Ok(Some(bitcode)) => bitcode, + Ok(None) => return Err(LinkerError::MissingBitcodeSection(path.to_owned())), + Err(e) => return Err(LinkerError::EmbeddedBitcodeError(e)), + }; + if !llvm::link_bitcode_buffer(context, module, &bitcode) { + return Err(LinkerError::LinkModuleError(path.to_owned())); + } + } // we need to handle this here since archive files could contain // mach-o files, eg somecrate.rlib containing 
lib.rmeta which is // mach-o on macos @@ -602,10 +621,6 @@ fn link_reader<'ctx>( InputType::Archive => panic!("nested archives not supported duh"), }; - if !llvm::link_bitcode_buffer(context, module, &bitcode) { - return Err(LinkerError::LinkModuleError(path.to_owned())); - } - Ok(()) } @@ -882,6 +897,8 @@ fn detect_input_type(data: &[u8]) -> Option { _ => { if &data[..8] == b"!\x0A" { Some(InputType::Archive) + } else if is_llvm_ir(data) { + Some(InputType::Ir) } else { None } @@ -889,6 +906,23 @@ fn detect_input_type(data: &[u8]) -> Option { } } +fn is_llvm_ir(data: &[u8]) -> bool { + // Trim whitespace from the start of the data + let trimmed = match data.iter().position(|b| !b.is_ascii_whitespace()) { + Some(position) => &data[position..], + None => return false, + }; + + // Checking for the presence of key keywords in the header + trimmed.starts_with(b"; ModuleID") + || trimmed.starts_with(b"target triple") + || trimmed.starts_with(b"target datalayout") + || trimmed.starts_with(b"source_filename") + || trimmed.starts_with(b"target ") + || trimmed.starts_with(b"define") + || trimmed.starts_with(b"!llvm") +} + pub struct LinkerOutput { inner: MemoryBuffer, } diff --git a/src/llvm/mod.rs b/src/llvm/mod.rs index 5e644084..4c1d549d 100644 --- a/src/llvm/mod.rs +++ b/src/llvm/mod.rs @@ -17,13 +17,14 @@ use llvm_sys::{ bit_reader::LLVMParseBitcodeInContext2, core::{ LLVMCreateMemoryBufferWithMemoryRange, LLVMDisposeMemoryBuffer, LLVMDisposeMessage, - LLVMGetEnumAttributeKindForName, LLVMGetMDString, LLVMGetModuleInlineAsm, LLVMGetTarget, - LLVMGetValueName2, LLVMRemoveEnumAttributeAtIndex, LLVMSetLinkage, LLVMSetModuleInlineAsm2, - LLVMSetVisibility, + LLVMDisposeModule, LLVMGetEnumAttributeKindForName, LLVMGetMDString, + LLVMGetModuleInlineAsm, LLVMGetTarget, LLVMGetValueName2, LLVMRemoveEnumAttributeAtIndex, + LLVMSetLinkage, LLVMSetModuleInlineAsm2, LLVMSetVisibility, }, error::{ LLVMDisposeErrorMessage, LLVMGetErrorMessage, LLVMGetErrorTypeId, 
LLVMGetStringErrorTypeId, }, + ir_reader::LLVMParseIRInContext, linker::LLVMLinkModules2, object::{ LLVMCreateBinary, LLVMDisposeBinary, LLVMDisposeSectionIterator, LLVMGetSectionContents, @@ -140,6 +141,48 @@ pub(crate) fn link_bitcode_buffer<'ctx>( linked } +#[must_use] +pub(crate) fn link_ir_buffer<'ctx>( + context: &'ctx LLVMContext, + module: &mut LLVMModule<'ctx>, + buffer: &CStr, +) -> bool { + let mut linked = false; + let buffer_name = c"ir_buffer"; + let buffer = buffer.to_bytes(); + let mem_buffer = unsafe { + LLVMCreateMemoryBufferWithMemoryRange( + buffer.as_ptr().cast(), + buffer.len(), + buffer_name.as_ptr(), + 1, + ) + }; + + let mut temp_module = ptr::null_mut(); + let mut error_msg = ptr::null_mut(); + + if unsafe { + LLVMParseIRInContext( + context.as_mut_ptr(), + mem_buffer, + &mut temp_module, + &mut error_msg, + ) + } == 0 + { + linked = unsafe { LLVMLinkModules2(module.as_mut_ptr(), temp_module) } == 0; + } else { + if !error_msg.is_null() { + unsafe { LLVMDisposeMessage(error_msg) }; + } + if !temp_module.is_null() { + unsafe { LLVMDisposeModule(temp_module) }; + } + } + + linked +} pub(crate) fn target_from_triple(triple: &CStr) -> Result { let mut target = ptr::null_mut(); diff --git a/tests/ir_file_test.rs b/tests/ir_file_test.rs new file mode 100644 index 00000000..e8b0783f --- /dev/null +++ b/tests/ir_file_test.rs @@ -0,0 +1,103 @@ +#![expect(unused_crate_dependencies, reason = "used in lib/bin")] + +use std::{ + env, fs, + path::{Path, PathBuf}, + process::Command, +}; + +fn linker_path() -> PathBuf { + PathBuf::from(env!("CARGO_BIN_EXE_bpf-linker")) +} + +fn create_test_ir_file(dir: &Path, name: &str) -> PathBuf { + let ir_path = dir.join(format!("{}.ll", name)); + let ir_content = format!( + r#"; ModuleID = '{name}' +source_filename = "{name}" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "bpf" + +define i32 @test_{name}(i32 %x) #0 {{ +entry: + %result = add i32 %x, 1 + ret i32 %result +}} + 
+attributes #0 = {{ noinline nounwind optnone }} + +!llvm.module.flags = !{{!0}} +!0 = !{{i32 1, !"wchar_size", i32 4}} +"# + ); + fs::write(&ir_path, ir_content).expect("Failed to write test IR file"); + ir_path +} + +#[test] +fn test_link_ir_file() { + let temp_dir = tempfile::tempdir().expect("Failed to create temp dir"); + let ir_file = create_test_ir_file(temp_dir.path(), "alessandro"); + let output_file = temp_dir.path().join("output.o"); + + let output = Command::new(linker_path()) + .arg("--export") + .arg(format!("test_{}", "alessandro")) + .arg(&ir_file) + .arg("-o") + .arg(&output_file) + .output() + .expect("Failed to execute bpf-linker"); + + if !output.status.success() { + eprintln!("stdout: {}", String::from_utf8_lossy(&output.stdout)); + eprintln!("stderr: {}", String::from_utf8_lossy(&output.stderr)); + panic!("bpf-linker failed with status: {}", output.status); + } + + assert!( + output_file.exists(), + "Output file should exist: {:?}", + output_file + ); + assert!( + output_file.metadata().unwrap().len() > 0, + "Output file should not be empty" + ); +} + +#[test] +fn test_invalid_ir_file() { + let temp_dir = tempfile::tempdir().expect("Failed to create temp dir"); + + let valid_ir_file = create_test_ir_file(temp_dir.path(), "alessandro"); + + let valid_content = fs::read_to_string(valid_ir_file).expect("Failed to read valid IR file"); + + // Corrupting IR content + let invalid_content = valid_content + .replace("define", "defXne") + .replace("add i32", "adX i32") + .replace("; ModuleID = 'alessandro'", ": ModuleXX = 'corrupted'"); + + let invalid_ir_file = temp_dir.path().join("corrupted.ll"); + + fs::write(&invalid_ir_file, invalid_content).expect("Failed to write invalid IR file"); + + let output_file = temp_dir.path().join("output.o"); + + let output = Command::new(linker_path()) + .arg(&invalid_ir_file) + .arg("-o") + .arg(&output_file) + .output() + .expect("Failed to execute bpf-linker"); + + // Should fail with corrupted IR + assert!( + 
!output.status.success(), + "bpf-linker should fail with corrupted IR. stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); +} + From bb104168a3363cecbc6e1922bfa4fc87dede64a1 Mon Sep 17 00:00:00 2001 From: BretasArthur1 Date: Fri, 21 Nov 2025 14:37:12 -0300 Subject: [PATCH 2/4] refactor: improve input handling in linker and update Cargo.toml dependencies --- Cargo.toml | 2 +- src/linker.rs | 58 ++++++++++++++++++++++++------------------- src/llvm/mod.rs | 36 +++++++++++---------------- tests/ir_file_test.rs | 1 - 4 files changed, 47 insertions(+), 50 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index aabd4238..3dea6514 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,8 +39,8 @@ tracing = "0.1" compiletest_rs = { version = "0.11.0" } regex = { version = "1.11.1", default-features = false } rustc-build-sysroot = { workspace = true } +tempfile = { version = "3.13" } which = { version = "8.0.0", default-features = false, features = ["real-sys", "regex"] } -tempfile = "3.13" [lints] workspace = true diff --git a/src/linker.rs b/src/linker.rs index 0d4a3822..2def42f3 100644 --- a/src/linker.rs +++ b/src/linker.rs @@ -3,7 +3,7 @@ use std::{ collections::HashSet, ffi::{CStr, CString, OsStr}, fs::File, - io::{self, Read, Seek}, + io::{self, BufRead, BufReader, Read, Seek}, ops::Deref, os::unix::ffi::OsStrExt as _, path::{Path, PathBuf}, @@ -510,24 +510,27 @@ where .create_module(c"linked_module") .ok_or(LinkerError::CreateModuleError)?; - // buffer used to perform file type detection - let mut buf = [0u8; 1024]; - for mut input in inputs { + for input in inputs { let path = match input { InputReader::File { path, .. } => path.into(), InputReader::Buffer { name, .. 
} => PathBuf::from(format!("in_memory::{}", name)), }; - // determine whether the input is bitcode, ELF with embedded bitcode, an archive file - // or an invalid file - let bytes_read = input - .read(&mut buf) + let mut buf = BufReader::new(input); + + // Peek at the buffer to determine file type + let preview = buf + .fill_buf() .map_err(|e| LinkerError::IoError(path.clone(), e))?; + + let in_type = detect_input_type(preview) + .ok_or_else(|| LinkerError::InvalidInputType(path.clone()))?; + + // Get back the inner reader to rewind it + let mut input = buf.into_inner(); input .rewind() .map_err(|e| LinkerError::IoError(path.clone(), e))?; - let in_type = detect_input_type(&buf[..bytes_read]) - .ok_or_else(|| LinkerError::InvalidInputType(path.clone()))?; match in_type { InputType::Archive => { @@ -597,9 +600,10 @@ fn link_reader<'ctx>( } } InputType::Ir => { - data.push(0); // force push null terminator - let data = CStr::from_bytes_with_nul(&data).unwrap(); - if !llvm::link_ir_buffer(context, module, data) { + let data = CString::new(data).unwrap(); + if !llvm::link_ir_buffer(context, module, &data) + .map_err(|_| LinkerError::LinkModuleError(path.to_owned()))? 
+ { return Err(LinkerError::LinkModuleError(path.to_owned())); } } @@ -907,20 +911,22 @@ fn detect_input_type(data: &[u8]) -> Option { } fn is_llvm_ir(data: &[u8]) -> bool { - // Trim whitespace from the start of the data - let trimmed = match data.iter().position(|b| !b.is_ascii_whitespace()) { - Some(position) => &data[position..], - None => return false, - }; + let trimmed = data.trim_ascii_start(); + if trimmed.is_empty() { + return false; + } - // Checking for the presence of key keywords in the header - trimmed.starts_with(b"; ModuleID") - || trimmed.starts_with(b"target triple") - || trimmed.starts_with(b"target datalayout") - || trimmed.starts_with(b"source_filename") - || trimmed.starts_with(b"target ") - || trimmed.starts_with(b"define") - || trimmed.starts_with(b"!llvm") + let prefixes: &[&[u8]] = &[ + b"; ModuleID", + b"target triple", + b"target datalayout", + b"source_filename", + b"target ", + b"define", + b"!llvm", + ]; + + prefixes.iter().any(|prefix| trimmed.starts_with(prefix)) } pub struct LinkerOutput { diff --git a/src/llvm/mod.rs b/src/llvm/mod.rs index 4c1d549d..1d6cbdef 100644 --- a/src/llvm/mod.rs +++ b/src/llvm/mod.rs @@ -17,9 +17,9 @@ use llvm_sys::{ bit_reader::LLVMParseBitcodeInContext2, core::{ LLVMCreateMemoryBufferWithMemoryRange, LLVMDisposeMemoryBuffer, LLVMDisposeMessage, - LLVMDisposeModule, LLVMGetEnumAttributeKindForName, LLVMGetMDString, - LLVMGetModuleInlineAsm, LLVMGetTarget, LLVMGetValueName2, LLVMRemoveEnumAttributeAtIndex, - LLVMSetLinkage, LLVMSetModuleInlineAsm2, LLVMSetVisibility, + LLVMGetEnumAttributeKindForName, LLVMGetMDString, LLVMGetModuleInlineAsm, LLVMGetTarget, + LLVMGetValueName2, LLVMRemoveEnumAttributeAtIndex, LLVMSetLinkage, LLVMSetModuleInlineAsm2, + LLVMSetVisibility, }, error::{ LLVMDisposeErrorMessage, LLVMGetErrorMessage, LLVMGetErrorTypeId, LLVMGetStringErrorTypeId, @@ -141,13 +141,12 @@ pub(crate) fn link_bitcode_buffer<'ctx>( linked } -#[must_use] + pub(crate) fn link_ir_buffer<'ctx>( context: 
&'ctx LLVMContext, module: &mut LLVMModule<'ctx>, buffer: &CStr, -) -> bool { - let mut linked = false; +) -> Result { let buffer_name = c"ir_buffer"; let buffer = buffer.to_bytes(); let mem_buffer = unsafe { @@ -155,33 +154,26 @@ pub(crate) fn link_ir_buffer<'ctx>( buffer.as_ptr().cast(), buffer.len(), buffer_name.as_ptr(), - 1, + 1, // LLVM internally sets RequiresTerminator=true ) }; let mut temp_module = ptr::null_mut(); - let mut error_msg = ptr::null_mut(); - - if unsafe { + let (ret, message) = Message::with(|error_msg| unsafe { LLVMParseIRInContext( context.as_mut_ptr(), mem_buffer, &mut temp_module, - &mut error_msg, + error_msg, ) - } == 0 - { - linked = unsafe { LLVMLinkModules2(module.as_mut_ptr(), temp_module) } == 0; + }); + + if ret == 0 { + let linked = unsafe { LLVMLinkModules2(module.as_mut_ptr(), temp_module) } == 0; + Ok(linked) } else { - if !error_msg.is_null() { - unsafe { LLVMDisposeMessage(error_msg) }; - } - if !temp_module.is_null() { - unsafe { LLVMDisposeModule(temp_module) }; - } + Err(message.as_string_lossy().to_string()) } - - linked } pub(crate) fn target_from_triple(triple: &CStr) -> Result { diff --git a/tests/ir_file_test.rs b/tests/ir_file_test.rs index e8b0783f..043b6f4d 100644 --- a/tests/ir_file_test.rs +++ b/tests/ir_file_test.rs @@ -100,4 +100,3 @@ fn test_invalid_ir_file() { String::from_utf8_lossy(&output.stderr) ); } - From 61951534cae63510f2512906633ceee1c66d8203 Mon Sep 17 00:00:00 2001 From: BretasArthur1 Date: Fri, 21 Nov 2025 19:24:36 -0300 Subject: [PATCH 3/4] chore: remove unnecessary check --- src/linker.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/linker.rs b/src/linker.rs index 2def42f3..54469e85 100644 --- a/src/linker.rs +++ b/src/linker.rs @@ -912,9 +912,6 @@ fn detect_input_type(data: &[u8]) -> Option { fn is_llvm_ir(data: &[u8]) -> bool { let trimmed = data.trim_ascii_start(); - if trimmed.is_empty() { - return false; - } let prefixes: &[&[u8]] = &[ b"; ModuleID", From 
c8c6f9a7b31945d7bddd8a0840355a63524436f6 Mon Sep 17 00:00:00 2001 From: BretasArthur1 Date: Fri, 28 Nov 2025 00:09:49 -0300 Subject: [PATCH 4/4] refactor: change `detect_input_type` to receive `InputType` as parameter and fix memory leak. remove unnecessary replacements on test --- src/linker.rs | 34 ++++++++++++++-------------------- src/llvm/mod.rs | 1 + tests/ir_file_test.rs | 6 ++---- 3 files changed, 17 insertions(+), 24 deletions(-) diff --git a/src/linker.rs b/src/linker.rs index 54469e85..7dc42336 100644 --- a/src/linker.rs +++ b/src/linker.rs @@ -3,7 +3,7 @@ use std::{ collections::HashSet, ffi::{CStr, CString, OsStr}, fs::File, - io::{self, BufRead, BufReader, Read, Seek}, + io::{self, Read, Seek}, ops::Deref, os::unix::ffi::OsStrExt as _, path::{Path, PathBuf}, @@ -510,24 +510,15 @@ where .create_module(c"linked_module") .ok_or(LinkerError::CreateModuleError)?; - for input in inputs { - let path = match input { - InputReader::File { path, .. } => path.into(), + for mut input in inputs { + let path = match &input { + InputReader::File { path, .. } => (*path).into(), InputReader::Buffer { name, .. 
} => PathBuf::from(format!("in_memory::{}", name)), }; - let mut buf = BufReader::new(input); - - // Peek at the buffer to determine file type - let preview = buf - .fill_buf() - .map_err(|e| LinkerError::IoError(path.clone(), e))?; - - let in_type = detect_input_type(preview) + let in_type = detect_input_type(&mut input) .ok_or_else(|| LinkerError::InvalidInputType(path.clone()))?; - // Get back the inner reader to rewind it - let mut input = buf.into_inner(); input .rewind() .map_err(|e| LinkerError::IoError(path.clone(), e))?; @@ -590,7 +581,7 @@ fn link_reader<'ctx>( .map_err(|e| LinkerError::IoError(path.to_owned(), e))?; // in_type is unknown when we're linking an item from an archive file let in_type = in_type - .or_else(|| detect_input_type(&data)) + .or_else(|| detect_input_type(reader.by_ref())) .ok_or_else(|| LinkerError::InvalidInputType(path.to_owned()))?; match in_type { @@ -889,19 +880,22 @@ impl llvm::LLVMDiagnosticHandler for DiagnosticHandler { } } -fn detect_input_type(data: &[u8]) -> Option { - if data.len() < 8 { +fn detect_input_type(reader: &mut impl Read) -> Option { + let mut header = [0u8; 16]; + let bytes_read = reader.read(&mut header).ok()?; + + if bytes_read < 4 { return None; } - match &data[..4] { + match &header[..4] { b"\x42\x43\xC0\xDE" | b"\xDE\xC0\x17\x0b" => Some(InputType::Bitcode), b"\x7FELF" => Some(InputType::Elf), b"\xcf\xfa\xed\xfe" => Some(InputType::MachO), _ => { - if &data[..8] == b"!\x0A" { + if bytes_read >= 8 && &header[..8] == b"!\x0A" { Some(InputType::Archive) - } else if is_llvm_ir(data) { + } else if is_llvm_ir(&header[..bytes_read]) { Some(InputType::Ir) } else { None diff --git a/src/llvm/mod.rs b/src/llvm/mod.rs index 1d6cbdef..b84886c4 100644 --- a/src/llvm/mod.rs +++ b/src/llvm/mod.rs @@ -172,6 +172,7 @@ pub(crate) fn link_ir_buffer<'ctx>( let linked = unsafe { LLVMLinkModules2(module.as_mut_ptr(), temp_module) } == 0; Ok(linked) } else { + unsafe { LLVMDisposeMemoryBuffer(mem_buffer) }; 
Err(message.as_string_lossy().to_string()) } } diff --git a/tests/ir_file_test.rs b/tests/ir_file_test.rs index 043b6f4d..4de122a4 100644 --- a/tests/ir_file_test.rs +++ b/tests/ir_file_test.rs @@ -75,10 +75,8 @@ fn test_invalid_ir_file() { let valid_content = fs::read_to_string(valid_ir_file).expect("Failed to read valid IR file"); // Corrupting IR content - let invalid_content = valid_content - .replace("define", "defXne") - .replace("add i32", "adX i32") - .replace("; ModuleID = 'alessandro'", ": ModuleXX = 'corrupted'"); + let invalid_content = + valid_content.replace("; ModuleID = 'alessandro'", ": ModuleXX = 'corrupted'"); let invalid_ir_file = temp_dir.path().join("corrupted.ll");