Skip to content

Commit ab89077

Browse files
fix(build): Better parsing of C type definitions (#1187)
* fix(build): Better parsing of C type definitions Header type definitions generated by cbindgen need to be deduplicated between ddcommon and other crates. Before, the code doing this dedup relied only on regexes, making it unreliable for nested definitions, and it didn't work for type aliases and unions. Now this should work. --------- Co-authored-by: Levi Morrison <[email protected]>
1 parent c285c41 commit ab89077

File tree

1 file changed

+237
-64
lines changed

1 file changed

+237
-64
lines changed

tools/src/lib.rs

Lines changed: 237 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,88 @@
22
// SPDX-License-Identifier: Apache-2.0
33

44
pub mod headers {
5-
use regex::{Match, Regex, RegexBuilder};
5+
use regex::{Regex, RegexBuilder};
66
use std::collections::HashSet;
77
use std::fs::{File, OpenOptions};
88
use std::io::{self, BufReader, BufWriter, Read, Seek, Write};
99
use std::sync::LazyLock;
1010

11-
static HEADER_TYPE_DECL_RE: LazyLock<Regex> = LazyLock::new(|| {
12-
RegexBuilder::new(r"^(/\*\*([^*]|\*+[^*/])*\*+/\n)?(#define [a-zA-Z_0-9]+ [^\n]+|typedef (struct|enum) [a-zA-Z_0-9]+ +(\{.*?\} )?[a-zA-Z_0-9]+;)\n+")
13-
.multi_line(true)
14-
.dot_matches_new_line(true)
15-
.build()
16-
.unwrap()
11+
/// A piece of header text together with the byte offsets it was cut from.
///
/// As built by `collect_definitions`, `str` is exactly `&header[start..end]`.
#[derive(Debug, Eq, PartialEq, Hash)]
struct Span<'a> {
    /// Byte offset where the span begins (inclusive).
    start: usize,
    /// Byte offset where the span stops (exclusive).
    end: usize,
    /// The text covered by the span, borrowed from the scanned header.
    str: &'a str,
}
17+
18+
static ITEM_DEFINITION_HEAD: LazyLock<Regex> = LazyLock::new(|| {
19+
RegexBuilder::new(
20+
r"^(?:/\*\*(?:[^*]|\*+[^*/])*\*+/\n)?(?:# *(define [a-zA-Z_0-9]+ [^\n]+)|(typedef))",
21+
)
22+
.multi_line(true)
23+
.dot_matches_new_line(true)
24+
.build()
25+
.unwrap()
1726
});
1827

19-
fn collect_definitions(header: &str) -> Vec<regex::Match<'_>> {
20-
HEADER_TYPE_DECL_RE.find_iter(header).collect()
28+
/// Gather all top level typedef and #define definitions from a C header file
29+
fn collect_definitions(header: &str) -> Vec<Span<'_>> {
30+
let mut items = Vec::new();
31+
let mut start = 0;
32+
33+
loop {
34+
let Some(head) = ITEM_DEFINITION_HEAD.captures_at(header, start) else {
35+
break;
36+
};
37+
start = head.get(0).unwrap().start();
38+
let end: usize;
39+
if let Some(capture) = head.get(2) {
40+
let mut depth: i32 = 0;
41+
let mut typedef_end = None;
42+
for (pos, c) in header.bytes().enumerate().skip(capture.end()) {
43+
match c {
44+
b';' if depth == 0 => {
45+
typedef_end = Some(pos + 1);
46+
break;
47+
}
48+
b'{' => {
49+
depth += 1;
50+
}
51+
b'}' => {
52+
depth = depth
53+
.checked_sub(1)
54+
.expect("Unmatched closing brace in typedef");
55+
}
56+
_ => {}
57+
}
58+
}
59+
let typedef_end = typedef_end.expect("No closing semicolon found for typedef");
60+
end = typedef_end
61+
+ header[typedef_end..]
62+
.bytes()
63+
.take_while(|c| matches!(c, b'\n' | b'\r' | b' '))
64+
.count();
65+
} else if let Some(capture) = head.get(1) {
66+
let define_end = capture.end();
67+
end = define_end
68+
+ header[define_end..]
69+
.bytes()
70+
.take_while(|c| matches!(c, b'\n' | b'\r' | b' '))
71+
.count();
72+
} else {
73+
unreachable!(
74+
"the regex should only capture typedef and #define, got {:?}",
75+
head
76+
);
77+
}
78+
79+
items.push(Span {
80+
start,
81+
end,
82+
str: &header[start..end],
83+
});
84+
start = end;
85+
}
86+
items
2187
}
2288

2389
fn read(f: &mut BufReader<&File>) -> String {
@@ -35,12 +101,12 @@ pub mod headers {
35101
Ok(())
36102
}
37103

38-
fn content_without_defs<'a>(content: &'a str, defs: &[Match]) -> Vec<&'a str> {
104+
fn content_without_defs<'a>(content: &'a str, defs: &[Span]) -> Vec<&'a str> {
39105
let mut new_content_parts = Vec::new();
40106
let mut pos = 0;
41107
for d in defs {
42-
new_content_parts.push(&content[pos..d.start()]);
43-
pos = d.end();
108+
new_content_parts.push(&content[pos..d.start]);
109+
pos = d.end;
44110
}
45111
new_content_parts.push(&content[pos..]);
46112
new_content_parts
@@ -61,7 +127,7 @@ pub mod headers {
61127

62128
child_defs
63129
.into_iter()
64-
.map(|m| m.as_str().to_owned())
130+
.map(|m| m.str.to_owned())
65131
.collect::<Vec<_>>()
66132
}) {
67133
if present.contains(&child_def) {
@@ -79,90 +145,197 @@ pub mod headers {
79145

80146
let base_header_content = read(&mut BufReader::new(&base_header));
81147
let base_defs = collect_definitions(&base_header_content);
82-
let base_defs_set: HashSet<_> = base_defs.iter().map(Match::as_str).collect();
148+
let base_defs_set: HashSet<_> = base_defs.iter().map(|s| s.str).collect();
83149

84-
let mut base_new_parts = vec![&base_header_content[..base_defs.last().unwrap().end()]];
150+
let mut base_new_parts = vec![&base_header_content[..base_defs.last().unwrap().end]];
85151
for child_def in &unique_child_defs {
86152
if base_defs_set.contains(child_def.as_str()) {
87153
continue;
88154
}
89155
base_new_parts.push(child_def);
90156
}
91-
base_new_parts.push(&base_header_content[base_defs.last().unwrap().end()..]);
157+
base_new_parts.push(&base_header_content[base_defs.last().unwrap().end..]);
92158
write_parts(&mut BufWriter::new(&base_header), &base_new_parts).unwrap();
93159
}
94160

95161
#[cfg(test)]
96162
mod tests {
97163
use super::*;
98164

99-
#[ignore]
165+
#[track_caller]
166+
fn test_regex_match(input: &str, expected: Vec<&str>) {
167+
let matches = collect_definitions(input);
168+
assert_eq!(
169+
matches.len(),
170+
expected.len(),
171+
"Expected:\n{:#?}\nActual:\n{:#?}",
172+
expected,
173+
matches
174+
);
175+
for (i, m) in matches.iter().enumerate() {
176+
assert_eq!(m.str, expected[i]);
177+
}
178+
}
179+
100180
#[test]
101-
fn collect_definitions_comments() {
102-
let header = r"/**
103-
* `QueueId` is a struct that represents a unique identifier for a queue.
104-
* It contains a single field, `inner`, which is a 64-bit unsigned integer.
105-
*/
106-
typedef uint64_t ddog_QueueId;
107-
108-
/**
109-
* Holds the raw parts of a Rust Vec; it should only be created from Rust,
110-
* never from C.
111-
**/
112-
typedef struct ddog_Vec_U8 {
113-
const uint8_t *ptr;
114-
uintptr_t len;
115-
uintptr_t capacity;
116-
} ddog_Vec_U8;
181+
fn collect_typedef() {
182+
let input = "typedef void *Foo;\n";
183+
let expected = vec!["typedef void *Foo;\n"];
184+
test_regex_match(input, expected);
185+
}
186+
187+
#[test]
188+
fn collect_typedef_comment() {
189+
let input = r"
190+
/**
191+
* This is a typedef for a pointer to Foo.
192+
*/
193+
typedef void *Foo;
194+
";
195+
let expected = vec![
196+
r"/**
197+
* This is a typedef for a pointer to Foo.
198+
*/
199+
typedef void *Foo;
200+
",
201+
];
202+
test_regex_match(input, expected);
203+
}
204+
205+
#[test]
206+
fn collect_struct_typedef() {
207+
let input = r"/**
208+
* This is a typedef for a pointer to a struct.
209+
*/
210+
typedef struct ddog_Vec_U8 {
211+
const uint8_t *ptr;
212+
uintptr_t len;
213+
uintptr_t capacity;
214+
} ddog_Vec_U8;
215+
";
216+
let expected = vec![input];
217+
test_regex_match(input, expected);
218+
}
219+
220+
#[test]
221+
fn collect_union_typedef() {
222+
let input = r"/**
223+
* This is a typedef for a pointer to a union.
224+
*/
225+
typedef union my_union {
226+
int a;
227+
float b;
228+
} my_union;
229+
";
230+
let expected = vec![input];
231+
test_regex_match(input, expected);
232+
}
233+
234+
#[test]
235+
fn collect_union_nested() {
236+
let input = r"typedef union ddog_Union_U8 {
237+
struct inner1 {
238+
const uint8_t *ptr;
239+
uintptr_t len;
240+
uintptr_t capacity;
241+
} inner;
242+
struct inner2 {
243+
const uint8_t *ptr;
244+
uintptr_t len;
245+
uintptr_t capacity;
246+
} inner2;
247+
} ddog_Union_U8;
248+
";
249+
let expected = vec![input];
250+
test_regex_match(input, expected);
251+
}
252+
253+
#[test]
254+
fn collect_define() {
255+
let input = r#"#define FOO __attribute__((unused))
256+
"#;
257+
let expected = vec![input];
258+
test_regex_match(input, expected);
259+
}
260+
261+
#[test]
262+
fn collect_multiple_definitions() {
263+
let input = r"
264+
/**
265+
* `QueueId` is a struct that represents a unique identifier for a queue.
266+
* It contains a single field, `inner`, which is a 64-bit unsigned integer.
267+
*/
268+
typedef uint64_t ddog_QueueId;
269+
270+
void foo() {
271+
}
272+
273+
/**
274+
* Holds the raw parts of a Rust Vec; it should only be created from Rust,
275+
* never from C.
276+
**/
277+
typedef struct ddog_Vec_U8 {
278+
const uint8_t *ptr;
279+
uintptr_t len;
280+
uintptr_t capacity;
281+
} ddog_Vec_U8;
117282
";
118-
let matches = collect_definitions(header);
119283

120-
assert_eq!(matches.len(), 1);
121-
assert_eq!(
122-
matches[0].as_str(),
284+
let expected = vec![
123285
r"/**
124-
* Holds the raw parts of a Rust Vec; it should only be created from Rust,
125-
* never from C.
126-
**/
127-
typedef struct ddog_Vec_U8 {
128-
const uint8_t *ptr;
129-
uintptr_t len;
130-
uintptr_t capacity;
131-
} ddog_Vec_U8;
132-
"
133-
);
286+
* `QueueId` is a struct that represents a unique identifier for a queue.
287+
* It contains a single field, `inner`, which is a 64-bit unsigned integer.
288+
*/
289+
typedef uint64_t ddog_QueueId;
290+
291+
",
292+
r"/**
293+
* Holds the raw parts of a Rust Vec; it should only be created from Rust,
294+
* never from C.
295+
**/
296+
typedef struct ddog_Vec_U8 {
297+
const uint8_t *ptr;
298+
uintptr_t len;
299+
uintptr_t capacity;
300+
} ddog_Vec_U8;
301+
",
302+
];
303+
test_regex_match(input, expected);
304+
}
134305

306+
#[test]
307+
fn collect_definitions_comments() {
135308
let header = r"/** foo */
136-
typedef struct ddog_Vec_U8 {
137-
const uint8_t *ptr;
138-
} ddog_Vec_U8;
139-
";
309+
typedef struct ddog_Vec_U8 {
310+
const uint8_t *ptr;
311+
} ddog_Vec_U8;
312+
";
140313
let matches = collect_definitions(header);
141314

142315
assert_eq!(matches.len(), 1);
143316
assert_eq!(
144-
matches[0].as_str(),
317+
matches[0].str,
145318
r"/** foo */
146-
typedef struct ddog_Vec_U8 {
147-
const uint8_t *ptr;
148-
} ddog_Vec_U8;
149-
"
319+
typedef struct ddog_Vec_U8 {
320+
const uint8_t *ptr;
321+
} ddog_Vec_U8;
322+
"
150323
);
151324

152325
let header = r"/** foo **/ */
153-
typedef struct ddog_Vec_U8 {
154-
const uint8_t *ptr;
155-
} ddog_Vec_U8;
156-
";
326+
typedef struct ddog_Vec_U8 {
327+
const uint8_t *ptr;
328+
} ddog_Vec_U8;
329+
";
157330
let matches = collect_definitions(header);
158331

159332
assert_eq!(matches.len(), 1);
160333
assert_eq!(
161-
matches[0].as_str(),
334+
matches[0].str,
162335
r"typedef struct ddog_Vec_U8 {
163-
const uint8_t *ptr;
164-
} ddog_Vec_U8;
165-
"
336+
const uint8_t *ptr;
337+
} ddog_Vec_U8;
338+
"
166339
);
167340
}
168341
}

0 commit comments

Comments
 (0)