Skip to content

Commit b648f56

Browse files
authored
postprocess: add --exclude-file for skipping functions (#1541)
As a general-purpose solution to bypassing functions the postprocessor's comment transferer (and future transforms, too) can't handle yet, this adds an exclude list to skip functions in files. There is also `--ident-filter`, which is a single regex just based on the function name, but this is more specific and also takes into account the file name. The file passed to `--exclude-file` is a YAML file containing file paths (relative to the exclude file's directory) and the function names within them to skip. This gets us around issues like conditionally-compiled comments (#1534), the first function in a file also including license comments and `#include`s, and anything else. I've run this on `json-c` so far ```sh (cd c2rust-postprocess/ && uv run postprocess $OLDPWD/tests/integration/tests/json-c/repo/lib.rs --no-update-rust --exclude-file $OLDPWD/tests/integration/tests/json-c/postprocess-exclude.yml) ``` and checked-in the LLM cache for it, but setting up the testing infrastructure to run this automatically is still a WIP (should be the next PR). Funnily enough, the LLM cache is going to end up behaving similarly to snapshot tests once it's up and running.
2 parents e6d4921 + 53c18d1 commit b648f56

File tree

137 files changed

+19370
-24
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

137 files changed

+19370
-24
lines changed

c2rust-postprocess/README.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,39 @@ accelerate the types of translation and migration that help move C code to Rust.
1717
- `c2rust-postprocess path/to/transpiled_rust.rs`, or
1818
- `uv run postprocess path/to/transpiled_rust.rs`
1919

20+
## Excluding/Filtering
21+
22+
`c2rust-postprocess` has a few ways to filter/exclude the function identifiers that are processed.
23+
24+
`--ident-filter` simply takes a regex.
25+
Anything matching the regex is filtered out of being processed.
26+
This is very useful for on-the-fly filtering that's easy to change quickly.
27+
28+
`--exclude-file` is for more granular, more permanent filtering/exclusion.
29+
It takes a path to an exclude file, which we tend to call `postprocess-exclude.yml`.
30+
This is a YAML file containing the file paths and function identifiers to exclude.
31+
For example,
32+
33+
```yaml
34+
src/lib.rs:
35+
- foo
36+
- bar
37+
38+
src/main.rs:
39+
- main
40+
```
41+
42+
will exclude `fn foo` and `fn bar` in `src/lib.rs`
43+
(resolved relative to the directory of `postprocess-exclude.yml`)
44+
and `fn main` in `src/main.rs`.
45+
46+
We filter based on the Rust file instead of the C file
47+
because the C file may not be available and isn't read by `c2rust-postprocess`
48+
(only the adjacent `*.c_decls.json` file is).
49+
50+
These file paths and function identifiers must match fully.
51+
They are not regexes or globs.
52+
2053
# Testing
2154

2255
## Test prerequisites

c2rust-postprocess/postprocess/__init__.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,12 @@
66
import logging
77
from argparse import BooleanOptionalAction
88
from collections.abc import Sequence
9+
from pathlib import Path
910

1011
from google.genai import types
1112

1213
from postprocess.cache import DirectoryCache, FrozenCache
14+
from postprocess.exclude_list import IdentifierExcludeList
1315
from postprocess.models import api_key_from_env, get_model_by_id
1416
from postprocess.models.mock import MockGenerativeModel
1517
from postprocess.transforms import (
@@ -45,7 +47,17 @@ def build_arg_parser() -> argparse.ArgumentParser:
4547
type=str,
4648
required=False,
4749
default=None,
48-
help="Regular expression to filter function identifiers to process",
50+
help="Regular expression to filter function identifiers to process"
51+
" (see README for more info)",
52+
)
53+
54+
parser.add_argument(
55+
"--exclude-file",
56+
type=Path,
57+
required=False,
58+
default=None,
59+
help="A YAML file of file paths and identifiers within them to exclude/skip"
60+
" (see README for more info)",
4961
)
5062

5163
parser.add_argument(
@@ -116,6 +128,7 @@ def main(argv: Sequence[str] | None = None):
116128
xform = CommentTransfer(cache, model)
117129
xform.transfer_comments_dir(
118130
root_rust_source_file=args.root_rust_source_file,
131+
exclude_list=IdentifierExcludeList(src_path=args.exclude_file),
119132
ident_filter=args.ident_filter,
120133
update_rust=args.update_rust,
121134
)

c2rust-postprocess/postprocess/cache.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
from platformdirs import user_cache_dir
1212
from tomlkit.items import String
1313

14+
from postprocess.utils import check_isinstance
15+
1416

1517
class AbstractCache(ABC):
1618
"""
@@ -94,12 +96,13 @@ def convert_value(value: TomlValue) -> TomlValue | String:
9496
return {k: convert_value(v) for k, v in value.items() if v is not None}
9597
elif isinstance(value, list):
9698
return [convert_value(e) for e in value if e is not None]
99+
elif value is None:
100+
raise TypeError("top-level `None` `TomlValue`s are not allowed")
97101
else:
98-
assert value is not None
99102
return value
100103

101104
converted_value = convert_value(value)
102-
assert isinstance(converted_value, dict)
105+
converted_value = check_isinstance(converted_value, dict)
103106
doc = tomlkit.document()
104107
for k, v in converted_value.items():
105108
doc[k] = v
@@ -173,9 +176,16 @@ def lookup(
173176
try:
174177
toml = cache_file.read_text()
175178
except FileNotFoundError:
176-
logging.debug(f"Cache miss: {cache_file}: {messages}")
179+
data = {
180+
"transform": transform,
181+
"identifier": identifier,
182+
"model": model,
183+
"messages": messages,
184+
}
185+
toml = to_multiline_toml(data)
186+
logging.debug(f"Cache miss: {cache_file}:\n{toml}")
177187
return None
178-
logging.debug(f"Cache hit: {cache_file}")
188+
logging.debug(f"Cache hit: {cache_file}:\n{toml}")
179189
data = tomli.loads(toml)
180190

181191
return data["response"]
@@ -206,7 +216,7 @@ def update(
206216
response_path = cache_dir / "response.txt"
207217
metadata_path.write_text(toml)
208218
response_path.write_text(response)
209-
logging.debug(f"Cache updated: {cache_dir}")
219+
logging.debug(f"Cache updated: {cache_dir}:\n{toml}")
210220

211221
def clear(self) -> None:
212222
self._path.unlink(missing_ok=True)

c2rust-postprocess/postprocess/definitions/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ def get_comments_text(comments: Iterable[str]) -> list[str]:
174174
comment_line
175175
for comment in comments
176176
for comment_line in get_comment_text(comment)
177+
if comment_line
177178
]
178179

179180

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from pathlib import Path
2+
3+
import yaml
4+
5+
from postprocess.utils import check_isinstance
6+
7+
8+
class IdentifierExcludeList:
9+
src_path: Path | None
10+
paths: dict[Path, set[str]]
11+
12+
def __init__(self, src_path: Path | None) -> None:
13+
self.src_path = src_path
14+
self.paths = {}
15+
if src_path is None:
16+
return
17+
data = yaml.safe_load(src_path.read_text())
18+
data = check_isinstance(data, dict)
19+
for path, identifiers in data.items():
20+
path = check_isinstance(path, str)
21+
identifiers = check_isinstance(identifiers, list)
22+
identifiers = [check_isinstance(ident, str) for ident in identifiers]
23+
path = Path(path)
24+
existing_identifiers = self.paths.get(path)
25+
if existing_identifiers is None:
26+
self.paths[path] = set(identifiers)
27+
else:
28+
existing_identifiers.update(*identifiers)
29+
30+
def contains(self, path: Path, identifier: str) -> bool:
31+
# No `src_path` means an empty exclude list.
32+
if self.src_path is None:
33+
return False
34+
# Consider paths relative to `src_path`, the location of the exclude file.
35+
rel_path: Path = path.relative_to(self.src_path.parent)
36+
identifiers = self.paths.get(rel_path, set())
37+
return identifier in identifiers

c2rust-postprocess/postprocess/transforms.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
get_rust_definitions,
1212
update_rust_definition,
1313
)
14+
from postprocess.exclude_list import IdentifierExcludeList
1415
from postprocess.models import AbstractGenerativeModel
1516
from postprocess.utils import get_highlighted_c, get_highlighted_rust, remove_backticks
1617

@@ -58,10 +59,11 @@ def transfer_comments(
5859
self,
5960
root_rust_source_file: Path,
6061
rust_source_file: Path,
62+
exclude_list: IdentifierExcludeList,
6163
ident_filter: str | None = None,
6264
update_rust: bool = True,
6365
) -> None:
64-
pattern = re.compile(ident_filter) if ident_filter else None
66+
ident_regex = re.compile(ident_filter) if ident_filter else None
6567

6668
rust_definitions = get_rust_definitions(rust_source_file)
6769
c_definitions = get_c_definitions(rust_source_file)
@@ -71,7 +73,17 @@ def transfer_comments(
7173

7274
prompts: list[CommentTransferPrompt] = []
7375
for identifier, rust_definition in rust_definitions.items():
74-
if pattern and not pattern.search(identifier):
76+
if exclude_list.contains(path=rust_source_file, identifier=identifier):
77+
logging.info(
78+
f"Skipping Rust fn {identifier} in {rust_source_file}"
79+
f"due to exclude file {exclude_list.src_path}"
80+
)
81+
continue
82+
if ident_regex and not ident_regex.search(identifier):
83+
logging.info(
84+
f"Skipping Rust fn {identifier} in {rust_source_file}"
85+
f"due to ident filter {ident_filter}"
86+
)
7587
continue
7688

7789
rust_comments = get_rust_comments(rust_definition)
@@ -170,6 +182,7 @@ def transfer_comments(
170182
def transfer_comments_dir(
171183
self,
172184
root_rust_source_file: Path,
185+
exclude_list: IdentifierExcludeList,
173186
ident_filter: str | None = None,
174187
update_rust: bool = True,
175188
):
@@ -187,6 +200,7 @@ def transfer_comments_dir(
187200
self.transfer_comments(
188201
root_rust_source_file=root_rust_source_file,
189202
rust_source_file=rs_path,
203+
exclude_list=exclude_list,
190204
ident_filter=ident_filter,
191205
update_rust=update_rust,
192206
)

c2rust-postprocess/postprocess/utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,12 @@
1313
from requests.exceptions import JSONDecodeError
1414

1515

16+
def check_isinstance[T](value: object, typ: type[T]) -> T:
17+
if not isinstance(value, typ):
18+
raise TypeError(f"Expected {typ}, got {type(value)}")
19+
return value
20+
21+
1622
def get_tool_path(tool_name: str) -> Path:
1723
"""Get the path to the given tool in the system PATH."""
1824

c2rust-postprocess/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ dependencies = [
88
"openai>=2.9.0",
99
"platformdirs>=4.5.1",
1010
"pygments>=2.19.2",
11+
"pyyaml>=6.0.3",
1112
"tomli>=2.3.0",
1213
"tomlkit>=0.13.3",
1314
"tree-sitter>=0.25.2",

c2rust-postprocess/tests/examples/qsort.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,7 @@
77
unused_mut
88
)]
99
#[no_mangle]
10-
pub unsafe extern "C" fn swap(
11-
mut a: *mut ::core::ffi::c_int,
12-
mut b: *mut ::core::ffi::c_int,
13-
) {
10+
pub unsafe extern "C" fn swap(mut a: *mut ::core::ffi::c_int, mut b: *mut ::core::ffi::c_int) {
1411
let mut t: ::core::ffi::c_int = *a;
1512
*a = *b;
1613
*b = t;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
transform = "CommentTransfer"
2+
identifier = "_json_c_set_last_err"
3+
model = "gemini-3-flash-preview"
4+
response = """#[no_mangle]
5+
6+
pub unsafe extern "C" fn _json_c_set_last_err(
7+
mut err_fmt: *const ::core::ffi::c_char,
8+
mut args: ...
9+
) {
10+
let mut ap: ::core::ffi::VaListImpl;
11+
ap = args.clone();
12+
// Ignore (attempted) overruns from snprintf
13+
vsnprintf(
14+
&raw mut _last_err as *mut ::core::ffi::c_char,
15+
::core::mem::size_of::<[::core::ffi::c_char; 256]>() as size_t,
16+
err_fmt,
17+
ap.as_va_list(),
18+
);
19+
}"""
20+
21+
[[messages]]
22+
role = "user"
23+
content = """Transfer the comments from the following C function to the corresponding Rust function.
24+
Do not add any comments that are not present in the C function.
25+
Use Rust doc comment syntax (///) where appropriate (e.g., for function documentation).
26+
Respond with the Rust function definition with the transferred comments; say nothing else.
27+
28+
C function:
29+
```c
30+
void _json_c_set_last_err(const char *err_fmt, ...)
31+
{
32+
\tva_list ap;
33+
\tva_start(ap, err_fmt);
34+
\t// Ignore (attempted) overruns from snprintf
35+
\t(void)vsnprintf(_last_err, sizeof(_last_err), err_fmt, ap);
36+
\tva_end(ap);
37+
}```
38+
39+
Rust function:
40+
```rust
41+
#[no_mangle]
42+
43+
pub unsafe extern "C" fn _json_c_set_last_err(
44+
mut err_fmt: *const ::core::ffi::c_char,
45+
mut args: ...
46+
) {
47+
let mut ap: ::core::ffi::VaListImpl;
48+
ap = args.clone();
49+
vsnprintf(
50+
&raw mut _last_err as *mut ::core::ffi::c_char,
51+
::core::mem::size_of::<[::core::ffi::c_char; 256]>() as size_t,
52+
err_fmt,
53+
ap.as_va_list(),
54+
);
55+
}```
56+
"""

0 commit comments

Comments
 (0)