Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
**/tests/
/build
*.pyc
**/__pycache__
*.egg-info/
.vagrant
**/compile_commands.json
.python-version
Expand Down
21 changes: 18 additions & 3 deletions c2rust-ast-exporter/src/AstExporter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -830,9 +830,13 @@ class TranslateASTVisitor final
Decl *ast, ASTEntryTag tag, const std::vector<void *> &childIds,
const QualType T,
std::function<void(CborEncoder *)> extra = [](CborEncoder *) {}) {
auto &Mgr = Context->getSourceManager();
auto rvalue = false;
auto encodeMacroExpansions = false;
encode_entry_raw(ast, tag, ast->getSourceRange(), T, rvalue,
auto charRange = clang::CharSourceRange::getCharRange(ast->getSourceRange());
// Extend the range to include the entire final token.
charRange.setEnd(clang::Lexer::getLocForEndOfToken(ast->getSourceRange().getEnd(), 0, Mgr, Context->getLangOpts()));
encode_entry_raw(ast, tag, charRange.getAsRange(), T, rvalue,
isVaList(ast, T), encodeMacroExpansions, childIds, extra);
}

Expand Down Expand Up @@ -1998,7 +2002,13 @@ class TranslateASTVisitor final

// We prefer non-implicit decls for their type information.
auto functionType = paramsFD->getType();
auto span = paramsFD->getSourceRange();
auto &Mgr = Context->getSourceManager();
auto rvalue = false;
auto encodeMacroExpansions = false;
auto charRange = clang::CharSourceRange::getCharRange(FD->getSourceRange());
// Extend the range to include the entire final token.
charRange.setEnd(clang::Lexer::getLocForEndOfToken(FD->getSourceRange().getEnd(), 0, Mgr, Context->getLangOpts()));
auto span = charRange.getAsRange();
encode_entry(
FD, TagFunctionDecl, span, childIds, functionType,
[this, FD](CborEncoder *array) {
Expand Down Expand Up @@ -2119,7 +2129,12 @@ class TranslateASTVisitor final
// type
auto T = def->getType();

auto loc = is_defn ? def->getLocation() : VD->getLocation();
auto loc = is_defn ? def->getSourceRange() : VD->getSourceRange();
auto &Mgr = Context->getSourceManager();
auto charRange = clang::CharSourceRange::getCharRange(loc);
// Extend the range to include the entire final token.
charRange.setEnd(clang::Lexer::getLocForEndOfToken(loc.getEnd(), 0, Mgr, Context->getLangOpts()));
loc = charRange.getAsRange();

encode_entry(
VD, TagVarDecl, loc, childIds, T,
Expand Down
43 changes: 43 additions & 0 deletions c2rust-postprocess/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# LLM-based postprocessing of c2rust transpiler output

This is currently a prototype effort to gauge the extent to which LLMs can
accelerate the types of translation and migration that help move C code to Rust.

# Prerequisites

- Python 3.12 or later
- `uv` in path
- A valid `GEMINI_API_KEY` set
- A transpiled codebase with a correct `compile_commands.json`

# Running

- `c2rust-postprocess`, or
- `uv run postproc`

# Testing

## Test prerequisites

- `bear` and `c2rust` in path

```
uv run pytest -v
uv run pytest -v tests/test_utils.py # filter tests to run
```

## Misc

- `uv run ruff check --fix .` to lint and apply automatic fixes; `uv run ruff format .` to format

# TODOs

- testable prototype
- pluggable support for getting definitions
- gemini api support
- filtering by file and function name
- file-based caching of model responses
- openai model support
- anthropic model support
- openrouter API support?

2 changes: 2 additions & 0 deletions c2rust-postprocess/c2rust-postprocess
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/sh
# Thin wrapper around the `postproc` entry point declared in pyproject.toml.
# Forward every command-line argument: the CLI requires the path to
# compile_commands.json as a positional argument, so dropping "$@" would make
# the wrapper unusable. `exec` lets the tool's exit status become ours.
exec uv run postproc "$@"
7 changes: 7 additions & 0 deletions c2rust-postprocess/postprocess/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""
c2rust-postprocess: Transfer comments from C functions to Rust functions using LLMs.
"""


def transfer_comments():
    """Placeholder for the comment-transfer entry point; currently does nothing."""
    return None
46 changes: 46 additions & 0 deletions c2rust-postprocess/postprocess/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import argparse
import logging
import sys
from collections.abc import Sequence

from postprocess.utils import existing_file


def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the post-processing tool.

    The parser accepts one positional argument, ``compile_commands`` (converted
    with ``existing_file``), and an optional ``--log-level`` flag restricted to
    the standard logging level names, defaulting to ``INFO``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    level_names = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]

    cli = argparse.ArgumentParser(
        description="Transfer C function comments to Rust using LLMs.",
    )

    # Required positional: the compilation database of the transpiled project.
    cli.add_argument(
        "compile_commands",
        help="Path to compile_commands.json.",
        type=existing_file,
    )

    # Optional verbosity control.
    cli.add_argument(
        "--log-level",
        default="INFO",
        choices=level_names,
        required=False,
        type=str,
        help="Logging level (default: INFO)",
    )

    return cli


def main(argv: Sequence[str] | None = None) -> int:
    """Command-line entry point.

    Parses the arguments, configures the root logger at the requested level and
    returns the process exit status.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` (the post-processing step itself is not implemented yet).
    """
    args = build_arg_parser().parse_args(argv)

    # Translate the level name (e.g. "INFO") into the value logging expects.
    level = logging.getLevelName(args.log_level.upper())
    logging.basicConfig(level=level)

    # TODO: implement post-processing logic using compile_commands
    _compile_commands = args.compile_commands

    return 0


# Run the CLI when this module is executed as a script; the value returned by
# main() becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
80 changes: 80 additions & 0 deletions c2rust-postprocess/postprocess/definitions/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import logging
from pathlib import Path
from typing import Any

import tree_sitter_rust as tsrust
from tree_sitter import Language, Parser

def get_c_sourcefile(compile_commands, rustfile: Path) -> Path | None:
c_file_guesses = [rustfile.with_suffix(".c"), rustfile.with_suffix(".C")]

files = [Path(d["file"]) for d in compile_commands]

for guess in c_file_guesses:
if guess in files:
return guess

return None


def get_rust_function_spans(rustfile: Path) -> list[dict[str, Any]]:
    """Collect name and span information for the top-level functions of a Rust file.

    The file is parsed with tree-sitter and only ``function_item`` nodes that are
    direct children of the root node are reported.

    Args:
        rustfile: Path of the Rust source file to parse.

    Returns:
        One dict per function with the keys ``name``, ``start_line``,
        ``end_line`` (both 1-indexed), ``start_byte`` and ``end_byte``.
        An empty list is returned when the file cannot be read.

    Raises:
        FileNotFoundError: If ``rustfile`` does not exist.
        NotADirectoryError: If ``rustfile`` exists but is not a regular file.
    """
    language = Language(tsrust.language())
    parser = Parser(language)

    if not rustfile.exists():
        raise FileNotFoundError(f"{rustfile} does not exist")
    if not rustfile.is_file():
        # NOTE(review): the exception type does not match the message; it is
        # kept unchanged so existing callers that catch it keep working.
        raise NotADirectoryError(f"{rustfile} is not a file")

    try:
        with open(rustfile, "rb") as rust_source:
            source_bytes = rust_source.read()
    except OSError as exc:
        logging.error(f"Failed to read Rust file {rustfile}: {exc}")
        return []

    tree = parser.parse(source_bytes)

    functions = []

    for node in tree.root_node.children:
        if node.type != "function_item":
            continue

        name_node = node.child_by_field_name("name")
        if name_node is None:
            # child_by_field_name may return None; skip the item instead of
            # failing with an AttributeError on the byte offsets below.
            logging.warning(
                f"Skipping function without a name at byte {node.start_byte} "
                f"in {rustfile}"
            )
            continue

        func_name = source_bytes[name_node.start_byte : name_node.end_byte].decode(
            "utf-8"
        )

        functions.append(
            {
                "name": func_name,
                # tree-sitter rows are 0-indexed; report 1-indexed line numbers
                "start_line": node.start_point[0] + 1,
                "end_line": node.end_point[0] + 1,
                "start_byte": node.start_byte,
                "end_byte": node.end_byte,
            }
        )

    return functions


def get_c_functions_spans(compile_commands, c_file: Path) -> list[dict[str, Any]]:
    """Collect name and span information for the functions of a C file.

    The compile-command entry whose ``"file"`` equals ``str(c_file)`` is used to
    obtain the Clang AST as JSON, from which the function declarations are
    extracted.

    Args:
        compile_commands: Iterable of compile-command entries (dicts with a
            ``"file"`` key).
        c_file: Path of the C source file, as spelled in the compilation database.

    Returns:
        One dict per function with the keys ``name``, ``start_line`` (taken from
        the declaration's ``loc``), ``start_byte``, ``end_line`` and ``end_byte``
        (taken from its ``range``).

    Raises:
        AssertionError: If no compile-command entry exists for ``c_file``.
    """
    from .clang import get_c_ast_as_json, get_functions_from_clang_ast

    entry = next((c for c in compile_commands if c["file"] == str(c_file)), None)

    assert entry is not None, f"No compile command entry for {c_file}"

    c_fn_asts = get_functions_from_clang_ast(get_c_ast_as_json(entry))

    functions = []
    for fn in c_fn_asts:
        loc = fn["loc"]
        # NOTE(review): clang's JSON AST dump may omit "file"/"line" keys that
        # repeat the previous location -- confirm this filter (and the
        # range.end "line" lookup below) does not drop or fail on such entries.
        if "line" in loc and "col" in loc and "file" in loc:
            functions.append(
                {
                    "name": fn["name"],
                    "start_line": loc["line"],
                    "start_byte": fn["range"]["begin"]["offset"],
                    "end_line": fn["range"]["end"]["line"],
                    "end_byte": fn["range"]["end"]["offset"],
                }
            )

    # The collected spans were previously built but never returned, so the
    # function always yielded None.
    return functions


68 changes: 68 additions & 0 deletions c2rust-postprocess/postprocess/definitions/clang/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@

import json
import subprocess
from typing import Any

import jq


def get_functions_from_clang_ast(ast: dict[str, Any]) -> list[dict[str, Any]]:
    """
    Select the function declarations found directly under a Clang AST node.

    Args:
        ast (dict): The AST JSON as a dictionary; its ``inner`` list is scanned
            for entries whose ``kind`` is ``FunctionDecl``.
    Returns:
        list[dict]: One dictionary per function declaration, holding its
            ``name``, ``loc`` and ``range``.
    """
    function_decl_query = (
        '.inner[] | select(.kind =="FunctionDecl") '
        "| {name: .name, loc: .loc, range: .range}"
    )
    program = jq.compile(function_decl_query)
    return program.transform(ast, multiple_output=True)


def get_c_ast_as_json(entry: dict[str, Any]) -> dict[str, Any]:
    """
    Get AST as JSON for a translation unit identified by compile commands entry.

    The entry's ``arguments`` are reused with the compiler replaced by ``clang``
    and the trailing output options replaced by flags that dump the AST as JSON.
    The entry itself is not modified.

    Args:
        entry (dict): A compile-command entry providing ``file``, ``arguments``
            and ``directory``.
    Returns:
        dict: The parsed JSON AST printed by clang.
    Raises:
        subprocess.CalledProcessError: If clang exits with a non-zero status
            (its stderr is printed before re-raising).
    """
    source_file = entry["file"]

    # Copy the argument list: assigning cmd[0] on the original would silently
    # rewrite the caller's compile_commands entry.
    cmd = list(entry["arguments"])
    cmd[0] = "clang"  # make sure we use clang
    # drop the last four elements which are the output options
    cmd = cmd[:-4]  # TODO: validate that these are the output options
    # add the necessary flags to dump the AST as JSON
    cmd += [
        "-fsyntax-only",
        "-Xclang",
        "-ast-dump=json",
        "-fparse-all-comments",  # NOTE: Clang AST only includes doc comments
        source_file,
    ]
    try:
        # cwd to the directory from the compile_commands.json entry to make sure
        # relative paths in the command work correctly
        result = subprocess.run(
            cmd, capture_output=True, text=True, check=True, cwd=entry["directory"]
        )
        return json.loads(result.stdout)
    except subprocess.CalledProcessError as e:
        print(f"Error running clang on {source_file}: {e.stderr}")
        raise


def is_entry_from_c_file(entry: dict[str, Any], c_file: str) -> bool:
    """
    Decide whether an AST entry is attributed to the given C file.

    The entry's ``loc`` is inspected in this order: an explicit ``file``, then
    the ``includedFrom`` file of ``spellingLoc``, then that of ``expansionLoc``.
    When none of these is present, the entry counts as coming from ``c_file``
    unless ``loc`` itself carries an ``includedFrom`` key.
    """
    location = entry["loc"]

    if "file" in location:
        return location["file"] == c_file

    for nested_key in ("spellingLoc", "expansionLoc"):
        if nested_key in location and "includedFrom" in location[nested_key]:
            return location[nested_key]["includedFrom"]["file"] == c_file

    # entry was parsed from c_file so by default it is from that file,
    # unless it is marked as coming from an include
    return "includedFrom" not in location
Empty file.
24 changes: 24 additions & 0 deletions c2rust-postprocess/postprocess/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import argparse
from pathlib import Path


def existing_file(value: str) -> Path:
    """argparse ``type=`` converter that accepts only paths of existing files.

    Args:
        value: The raw command-line string.

    Returns:
        ``value`` as a ``Path``.

    Raises:
        argparse.ArgumentTypeError: If ``value`` does not name an existing file.
    """
    candidate = Path(value)
    if not candidate.is_file():
        raise argparse.ArgumentTypeError(f"{value!r} is not a readable file")
    return candidate


def get_rust_files(path: Path) -> list[Path]:
    """Recursively list every ``*.rs`` file below a directory.

    Args:
        path: Directory to search.

    Returns:
        The paths matched by the ``**/*.rs`` glob, in glob order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        NotADirectoryError: If ``path`` exists but is not a directory.
    """
    if not path.exists():
        raise FileNotFoundError(f"{path} does not exist")
    if not path.is_dir():
        raise NotADirectoryError(f"{path} is not a directory")

    return list(path.glob("**/*.rs"))


56 changes: 56 additions & 0 deletions c2rust-postprocess/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
[project]
name = "postprocess"
version = "0.1.0"
description = "c2rust-postprocess"
requires-python = ">=3.12"
dependencies = [
"google-genai",
"jq>=1.10.0",
"pygments>=2.19.2",
"tree-sitter>=0.25.2",
"tree-sitter-rust>=0.24.0",
]

[project.scripts]
postproc = "postprocess.__main__:main" # CLI entry point

[tool.uv]
# let `uv` manage the environment
package = true

[dependency-groups]
dev = [
"pytest>=9.0.1",
"ruff>=0.14.6",
]

# lets us run tests as `uv run pytest`
[tool.pytest.ini_options]
minversion = "6.0"
# strict-markers: raises error on unregistered markers (catch typos)
addopts = "--strict-markers"
# Only search for tests in the 'tests' folder
testpaths = ["tests"]
# Add current directory to pythonpath so 'postprocess' is importable
pythonpath = ["."]

[tool.ruff]
target-version = "py312"
line-length = 88

[tool.ruff.lint]
select = [
"E", # pycodestyle errors (standard formatting)
"W", # pycodestyle warnings
"F", # Pyflakes (finds unused imports, variables, etc.)
"I", # isort (sorts your imports automatically)
"B", # flake8-bugbear (catches common bugs/pitfalls)
"UP", # pyupgrade (rewrites old syntax to new Python 3.12 syntax)
"SIM", # flake8-simplify (suggests simplifying complex logic)
"N", # pep8-naming (ensures variables use snake_case, classes use CamelCase)
]

ignore = []

[tool.ruff.lint.isort]
known-first-party = ["postprocess"]
Loading
Loading