Skip to content

Commit 039dea2

Browse files
committed
Set up initial project structure
Also add bare-bones test which can be run like so: ``` uvx pytest tests/test_postprocess.py ``` as long as `uvx` is in path.
1 parent 01b2aee commit 039dea2

File tree

14 files changed

+1097
-0
lines changed

14 files changed

+1097
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
**/tests/
1111
/build
1212
*.pyc
13+
**/__pycache__
14+
*.egg-info/
1315
.vagrant
1416
**/compile_commands.json
1517
.python-version

c2rust-postprocess/README.md

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# LLM-based postprocessing of c2rust transpiler output
2+
3+
This is currently a prototype effort to gauge the extent to which LLMs can
4+
accelerate the types of translation and migration that help move C code to Rust.
5+
6+
# Prerequisites
7+
8+
- Python 3.12 or later
9+
- `uv` in path
10+
- A valid `GEMINI_API_KEY` set
11+
- A transpiled codebase with a correct `compile_commands.json`
12+
13+
# Running
14+
15+
- `c2rust-postprocess`, or
16+
- `uv run postproc`
17+
18+
# Testing
19+
20+
## Test prerequisites
21+
22+
- `bear` and `c2rust` in path
23+
24+
```
25+
uv run pytest -v
26+
uv run pytest -v tests/test_utils.py # filter tests to run
27+
```
28+
29+
## Misc
30+
31+
- `uv run ruff check --fix .` to lint and apply automatic fixes (`uv run ruff format .` to format)
32+
33+
# TODOs
34+
35+
- testable prototype
36+
- pluggable support for getting definitions
37+
- gemini api support
38+
- filtering by file and function name
39+
- file-based caching of model responses
40+
- openai model support
41+
- anthropic model support
42+
- openrouter API support?
43+
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
#!/bin/sh
# Thin wrapper around the `postproc` entry point declared in pyproject.toml.
# Forward every command-line argument: the CLI requires the path to
# compile_commands.json, which would otherwise be silently dropped.
uv run postproc "$@"
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
"""
2+
c2rust-postprocess: Transfer comments from C functions to Rust functions using LLMs.
3+
"""
4+
5+
6+
def transfer_comments():
    """Placeholder for the comment-transfer entry point; currently a no-op."""
    return None
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import argparse
2+
import logging
3+
import sys
4+
from collections.abc import Sequence
5+
6+
from postprocess.utils import existing_file
7+
8+
9+
def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the postprocess tool.

    The parser accepts one positional argument, ``compile_commands``, which is
    converted by ``existing_file``, and an optional ``--log-level`` flag
    restricted to the standard logging level names.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    log_levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]

    arg_parser = argparse.ArgumentParser(
        description="Transfer C function comments to Rust using LLMs.",
    )

    # Positional: where the compilation database lives.
    arg_parser.add_argument(
        "compile_commands",
        help="Path to compile_commands.json.",
        type=existing_file,
    )

    # Optional: verbosity of the logging output.
    arg_parser.add_argument(
        "--log-level",
        choices=log_levels,
        default="INFO",
        help="Logging level (default: INFO)",
        required=False,
        type=str,
    )

    return arg_parser
29+
30+
31+
def main(argv: Sequence[str] | None = None) -> int:
    """Run the command-line interface.

    Args:
        argv: Argument list to parse; ``None`` lets argparse fall back to
            ``sys.argv[1:]``.

    Returns:
        The process exit status (always ``0`` for now).
    """
    args = build_arg_parser().parse_args(argv)

    # For a level *name*, getLevelName returns the matching numeric level.
    level = logging.getLevelName(args.log_level.upper())
    logging.basicConfig(level=level)

    # TODO: implement post-processing logic using compile_commands
    _compile_commands = args.compile_commands

    return 0
43+
44+
45+
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import logging
2+
from pathlib import Path
3+
from typing import Any
4+
5+
import tree_sitter_rust as tsrust
6+
from tree_sitter import Language, Parser
7+
8+
def get_c_sourcefile(compile_commands, rustfile: Path) -> Path | None:
9+
c_file_guesses = [rustfile.with_suffix(".c"), rustfile.with_suffix(".C")]
10+
11+
files = [Path(d["file"]) for d in compile_commands]
12+
13+
for guess in c_file_guesses:
14+
if guess in files:
15+
return guess
16+
17+
return None
18+
19+
20+
def get_rust_function_spans(rustfile: Path) -> list[dict[str, Any]]:
    """Collect name and location data for the top-level functions of a Rust file.

    The file is parsed with tree-sitter. Only ``function_item`` nodes that are
    direct children of the root node are reported.

    Args:
        rustfile: Path of the Rust source file to scan.

    Returns:
        One dict per function with the keys ``name``, ``start_line``,
        ``end_line``, ``start_byte`` and ``end_byte``. An empty list is
        returned when reading the file fails with ``OSError`` (the failure is
        logged).

    Raises:
        FileNotFoundError: ``rustfile`` does not exist.
        NotADirectoryError: ``rustfile`` exists but is not a regular file.
    """
    rust_parser = Parser(Language(tsrust.language()))

    if not rustfile.exists():
        raise FileNotFoundError(f"{rustfile} does not exist")
    if not rustfile.is_file():
        raise NotADirectoryError(f"{rustfile} is not a file")

    try:
        with open(rustfile, "rb") as handle:
            source_bytes = handle.read()
    except OSError as exc:
        logging.error(f"Failed to read Rust file {rustfile}: {exc}")
        return []

    def span_of(fn_node) -> dict[str, Any]:
        """Describe a single ``function_item`` node."""
        ident = fn_node.child_by_field_name("name")
        name_bytes = source_bytes[
            ident.start_byte : ident.end_byte  # type: ignore
        ]
        return {
            "name": name_bytes.decode("utf-8"),
            # tree-sitter rows are 0-indexed; report 1-indexed line numbers
            "start_line": fn_node.start_point[0] + 1,
            "end_line": fn_node.end_point[0] + 1,
            "start_byte": fn_node.start_byte,
            "end_byte": fn_node.end_byte,
        }

    root = rust_parser.parse(source_bytes).root_node
    return [span_of(child) for child in root.children if child.type == "function_item"]
57+
58+
59+
def get_c_functions_spans(compile_commands, c_file: Path) -> list[dict[str, Any]]:
    """Collect name and location data for the functions declared in a C file.

    Looks up the compilation-database entry whose ``"file"`` equals
    ``str(c_file)``, dumps the Clang AST for it, and converts each function
    declaration that carries a full location into a span dict.

    Args:
        compile_commands: Iterable of compilation-database entries (mappings
            with at least a ``"file"`` key).
        c_file: Path of the C source file, spelled as in the database.

    Returns:
        One dict per function with the keys ``name``, ``start_line``,
        ``start_byte``, ``end_line`` and ``end_byte``.

    Raises:
        AssertionError: no entry for ``c_file`` exists in ``compile_commands``.
    """
    from .clang import get_c_ast_as_json, get_functions_from_clang_ast

    wanted = str(c_file)
    entry = next((c for c in compile_commands if c["file"] == wanted), None)

    assert entry is not None, f"No compile command entry for {c_file}"

    c_fn_asts = get_functions_from_clang_ast(get_c_ast_as_json(entry))

    functions = []
    for fn in c_fn_asts:
        loc = fn["loc"]
        # Only keep declarations whose location is fully specified.
        if "line" in loc and "col" in loc and "file" in loc:
            fn_range = fn["range"]
            functions.append(
                {
                    "name": fn["name"],
                    "start_line": loc["line"],
                    "start_byte": fn_range["begin"]["offset"],
                    "end_line": fn_range["end"]["line"],
                    "end_byte": fn_range["end"]["offset"],
                }
            )

    # The collected spans were previously built but never handed back.
    return functions
79+
80+
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
2+
import json
3+
import subprocess
4+
from typing import Any
5+
6+
import jq
7+
8+
9+
def get_functions_from_clang_ast(ast: dict[str, Any]) -> list[dict[str, Any]]:
    """
    Pull the function declarations out of a Clang AST JSON document.

    Only the direct children (``inner``) of ``ast`` are inspected; each child
    whose ``kind`` is ``FunctionDecl`` is projected onto its ``name``, ``loc``
    and ``range`` fields.

    Args:
        ast (dict): The AST JSON as a dictionary.
    Returns:
        list[dict]: One dictionary per function declaration.
    """
    jq_filter = (
        '.inner[] | select(.kind =="FunctionDecl") | '
        "{name: .name, loc: .loc, range: .range}"
    )
    program = jq.compile(jq_filter)
    return program.transform(ast, multiple_output=True)
21+
22+
23+
def get_c_ast_as_json(entry: dict[str, Any]) -> dict[str, Any]:
    """
    Get AST as JSON for a translation unit identified by compile commands entry.

    Args:
        entry (dict): A compilation-database entry providing ``"file"``,
            ``"arguments"`` and ``"directory"``. The entry is not modified.
    Returns:
        dict: The parsed JSON that clang wrote to stdout.
    Raises:
        subprocess.CalledProcessError: clang exited with a non-zero status
            (its stderr is printed before the exception is re-raised).
    """
    source_file = entry["file"]

    # Work on a copy: assigning into entry["arguments"] directly would rewrite
    # the caller's compile-commands entry as a side effect.
    cmd = list(entry["arguments"])
    cmd[0] = "clang"  # make sure we use clang
    # drop the last four elements which are the output options
    cmd = cmd[:-4]  # TODO: validate that these are the output options
    # add the necessary flags to dump the AST as JSON
    cmd += [
        "-fsyntax-only",
        "-Xclang",
        "-ast-dump=json",
        "-fparse-all-comments",  # NOTE: Clang AST only includes doc comments
        source_file,
    ]
    try:
        # cwd to the directory from the compile_commands.json entry to make sure
        # relative paths in the command work correctly
        result = subprocess.run(
            cmd, capture_output=True, text=True, check=True, cwd=entry["directory"]
        )
        return json.loads(result.stdout)
    except subprocess.CalledProcessError as e:
        print(f"Error running clang on {source_file}: {e.stderr}")
        raise
51+
52+
53+
def is_entry_from_c_file(entry: dict[str, Any], c_file: str) -> bool:
    """
    Decide whether an AST entry belongs to the given C file.

    The entry's ``loc`` is checked in this order: an explicit ``file``, then
    the ``includedFrom`` file of ``spellingLoc``, then that of
    ``expansionLoc``. If none of these is present, the entry counts as coming
    from ``c_file`` unless ``loc`` itself has an ``includedFrom`` key.
    """
    loc = entry["loc"]

    if "file" in loc:
        return loc["file"] == c_file

    for key in ("spellingLoc", "expansionLoc"):
        if key in loc and "includedFrom" in loc[key]:
            return loc[key]["includedFrom"]["file"] == c_file

    # entry was parsed from c_file so by default it is from that file
    return "includedFrom" not in loc

c2rust-postprocess/postprocess/transforms/__init__.py

Whitespace-only changes.
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import argparse
2+
from pathlib import Path
3+
4+
5+
def existing_file(value: str) -> Path:
    """Convert a command-line string to a ``Path``, requiring an existing file.

    Intended for use as an argparse ``type=`` callable.

    Raises:
        argparse.ArgumentTypeError: ``value`` does not name an existing file.
    """
    candidate = Path(value)
    if not candidate.is_file():
        raise argparse.ArgumentTypeError(f"{value!r} is not a readable file")
    return candidate
10+
11+
12+
def get_rust_files(path: Path) -> list[Path]:
    """Return every ``*.rs`` file found recursively below a directory.

    Args:
        path: Directory to search.

    Raises:
        FileNotFoundError: ``path`` does not exist.
        NotADirectoryError: ``path`` exists but is not a directory.
    """
    if not path.exists():
        raise FileNotFoundError(f"{path} does not exist")
    if not path.is_dir():
        raise NotADirectoryError(f"{path} is not a directory")

    return list(path.glob("**/*.rs"))
23+
24+

c2rust-postprocess/pyproject.toml

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
[project]
2+
name = "postprocess"
3+
version = "0.1.0"
4+
description = "c2rust-postprocess"
5+
requires-python = ">=3.12"
6+
dependencies = [
7+
"google-genai",
8+
"jq>=1.10.0",
9+
"pygments>=2.19.2",
10+
"tree-sitter>=0.25.2",
11+
"tree-sitter-rust>=0.24.0",
12+
]
13+
14+
[project.scripts]
15+
postproc = "postprocess.__main__:main" # CLI entry point
16+
17+
[tool.uv]
18+
# let `uv` manage the environment
19+
package = true
20+
21+
[dependency-groups]
22+
dev = [
23+
"pytest>=9.0.1",
24+
"ruff>=0.14.6",
25+
]
26+
27+
# lets us run tests as `uv run pytest`
28+
[tool.pytest.ini_options]
29+
minversion = "6.0"
30+
# strict-markers: raises error on unregistered markers (catch typos)
31+
addopts = "--strict-markers"
32+
# Only search for tests in the 'tests' folder
33+
testpaths = ["tests"]
34+
# Add current directory to pythonpath so 'postprocess' is importable
35+
pythonpath = ["."]
36+
37+
[tool.ruff]
38+
target-version = "py312"
39+
line-length = 88
40+
41+
[tool.ruff.lint]
42+
select = [
43+
"E", # pycodestyle errors (standard formatting)
44+
"W", # pycodestyle warnings
45+
"F", # Pyflakes (finds unused imports, variables, etc.)
46+
"I", # isort (sorts your imports automatically)
47+
"B", # flake8-bugbear (catches common bugs/pitfalls)
48+
"UP", # pyupgrade (rewrites old syntax to new Python 3.12 syntax)
49+
"SIM", # flake8-simplify (suggests simplifying complex logic)
50+
"N", # pep8-naming (ensures variables use snake_case, classes use CamelCase)
51+
]
52+
53+
ignore = []
54+
55+
[tool.ruff.lint.isort]
56+
known-first-party = ["postprocess"]

0 commit comments

Comments
 (0)