Skip to content

Commit 59c660d

Browse files
author
Adrian
committed
(More) proper(?) handling of static text.
- Generally cleaner dfa and lexer simulation interface/usage.
1 parent 8a74358 commit 59c660d

File tree

11 files changed

+199
-147
lines changed

11 files changed

+199
-147
lines changed

rust/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,4 @@ crate-type = ["lib", "staticlib"]
1010
cbindgen = "0.29"
1111

1212
[dependencies]
13-
libc = "0.2"
1413
nom = "8"

rust/Makefile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,11 @@ default:
55
# https://www.gnu.org/software/make/manual/html_node/Force-Targets.html
66
FORCE:
77

8-
examples:
8+
examples: target/debug/liblog_mechanic.a
99
$(MAKE) --directory $@
1010

1111
include/log_mechanic.generated.hpp: FORCE
1212
cbindgen --config cbindgen.toml --crate log-mechanic > $@
13+
14+
target/debug/liblog_mechanic.a: FORCE
15+
cargo build

rust/cbindgen.toml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ renaming_overrides_prefixing = false
7575

7676
[export.pre_body]
7777
"CSlice" = """
78+
// Custom
7879
CSlice(std::string_view const& view)
7980
requires std::is_same_v<T, char>
8081
: CSlice(view.data(), view.size())
@@ -84,10 +85,15 @@ CSlice(char const* c_str)
8485
requires std::is_same_v<T, char>
8586
: CSlice(std::string_view { c_str })
8687
{}
88+
89+
// Generated
8790
"""
8891

89-
"LogComponent" = """
90-
LogComponent() = default;
92+
"CLogFragment" = """
93+
// Custom
94+
CLogFragment() = default;
95+
96+
// Generated
9197
"""
9298

9399
[export.body]

rust/examples/cpp_usage/usage.cpp

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,17 @@
66
using namespace clp::log_mechanic;
77

88
int main() {
9-
Schema* schema { clp_log_mechanic_schema_new() };
9+
Box<Schema> schema { clp_log_mechanic_schema_new() };
1010

1111
clp_log_mechanic_schema_add_rule(schema, "hello", "abc|def");
1212

13-
Lexer* lexer { clp_log_mechanic_lexer_new(schema) };
13+
Box<Lexer> lexer { clp_log_mechanic_lexer_new(schema) };
1414

15-
LogComponent component {};
1615
size_t pos { 0 };
1716

18-
clp_log_mechanic_lexer_next_token(lexer, "def", &pos, &component);
19-
assert(component.rule == 1);
20-
assert(component.start + 3 == component.end);
17+
CLogFragment fragment { clp_log_mechanic_lexer_next_fragment(lexer, "def", &pos) };
18+
assert(fragment.rule == 1);
19+
assert(fragment.start + 3 == fragment.end);
2120

2221
printf("good!\n");
2322

rust/include/log_mechanic.generated.hpp

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ struct Schema;
1414

1515
template<typename T>
1616
struct CSlice {
17+
// Custom
1718
CSlice(std::string_view const& view)
1819
requires std::is_same_v<T, char>
1920
: CSlice(view.data(), view.size())
@@ -23,6 +24,8 @@ CSlice(char const* c_str)
2324
requires std::is_same_v<T, char>
2425
: CSlice(std::string_view { c_str })
2526
{}
27+
28+
// Generated
2629
const T *pointer;
2730
size_t length;
2831

@@ -38,28 +41,28 @@ using CStringView = CSlice<char>;
3841

3942
struct Capture {
4043
CStringView name;
41-
const uint8_t *start;
42-
const uint8_t *end;
44+
CStringView lexeme;
4345

4446
Capture(CStringView const& name,
45-
const uint8_t *const& start,
46-
const uint8_t *const& end)
47+
CStringView const& lexeme)
4748
: name(name),
48-
start(start),
49-
end(end)
49+
lexeme(lexeme)
5050
{}
5151

5252
};
5353

54-
struct LogComponent {
55-
LogComponent() = default;
54+
struct CLogFragment {
55+
// Custom
56+
CLogFragment() = default;
57+
58+
// Generated
5659
size_t rule;
5760
const uint8_t *start;
5861
const uint8_t *end;
5962
const Capture *captures;
6063
size_t captures_count;
6164

62-
LogComponent(size_t const& rule,
65+
CLogFragment(size_t const& rule,
6366
const uint8_t *const& start,
6467
const uint8_t *const& end,
6568
const Capture *const& captures,
@@ -80,10 +83,13 @@ void clp_log_mechanic_lexer_delete(Box<Lexer> lexer);
8083

8184
Box<Lexer> clp_log_mechanic_lexer_new(const Schema *schema);
8285

83-
bool clp_log_mechanic_lexer_next_token(Lexer *lexer,
84-
CStringView input,
85-
size_t *pos,
86-
LogComponent *log_component);
86+
/// Very unsafe!
87+
///
88+
/// The returned [`CLogFragment`] includes a hidden exclusive borrow of `lexer`
89+
/// (it contains a pointer into an internal buffer of `lexer`),
90+
/// so it is no longer valid/you must not use it after a subsequent exclusive borrow of `lexer`
91+
/// (i.e. this borrow has ended).
92+
CLogFragment clp_log_mechanic_lexer_next_fragment(Lexer *lexer, CStringView input, size_t *pos);
8793

8894
void clp_log_mechanic_schema_add_rule(Schema *schema, CStringView name, CStringView pattern);
8995

rust/include/log_mechanic.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
namespace clp::log_mechanic {
55

66
template<typename T> using Box = T*;
7-
87
}
98

109
#include "log_mechanic.generated.hpp"

rust/src/c_interface.rs

Lines changed: 73 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@ use std::ffi::c_char;
22
use std::marker::PhantomData;
33
use std::str::Utf8Error;
44

5+
use crate::lexer::Capture;
6+
use crate::lexer::Fragment;
57
use crate::lexer::Lexer;
6-
use crate::lexer::LogComponent;
78
use crate::regex::Regex;
89
use crate::schema::Schema;
910

1011
#[repr(C)]
12+
#[derive(Debug, Clone, Copy)]
1113
pub struct CSlice<'lifetime, T> {
1214
pointer: *const T,
1315
length: usize,
@@ -16,14 +18,14 @@ pub struct CSlice<'lifetime, T> {
1618

1719
pub type CStringView<'lifetime> = CSlice<'lifetime, c_char>;
1820

19-
// #[unsafe(no_mangle)]
20-
// unsafe extern "C" fn clp_log_mechanic_c_string_view<'unknown>(pointer: *const c_char) -> CSlice<'unknown, c_char> {
21-
// CSlice {
22-
// pointer,
23-
// length: unsafe { libc::strlen(pointer) },
24-
// _lifetime: PhantomData,
25-
// }
26-
// }
21+
#[repr(C)]
22+
pub struct CLogFragment<'schema, 'input> {
23+
pub rule: usize,
24+
pub start: *const u8,
25+
pub end: *const u8,
26+
pub captures: *const Capture<'schema, 'input>,
27+
pub captures_count: usize,
28+
}
2729

2830
#[unsafe(no_mangle)]
2931
extern "C" fn clp_log_mechanic_schema_new() -> Box<Schema> {
@@ -53,28 +55,35 @@ unsafe extern "C" fn clp_log_mechanic_schema_add_rule(
5355
}
5456

5557
#[unsafe(no_mangle)]
56-
unsafe extern "C" fn clp_log_mechanic_lexer_new<'schema>(schema: &'schema Schema) -> Box<Lexer<'schema>> {
57-
let lexer: Lexer<'_> = Lexer::new(schema).unwrap();
58+
unsafe extern "C" fn clp_log_mechanic_lexer_new<'schema, 'a>(schema: &'schema Schema) -> Box<Lexer<'schema, 'a>> {
59+
let lexer: Lexer<'_, '_> = Lexer::new(schema).unwrap();
5860
Box::new(lexer)
5961
}
6062

6163
#[unsafe(no_mangle)]
62-
unsafe extern "C" fn clp_log_mechanic_lexer_delete(lexer: Box<Lexer<'_>>) {
64+
unsafe extern "C" fn clp_log_mechanic_lexer_delete<'a>(lexer: Box<Lexer<'_, 'a>>) {
6365
std::mem::drop(lexer);
6466
}
6567

68+
/// Very unsafe!
69+
///
70+
/// The returned [`CLogFragment`] includes a hidden exclusive borrow of `lexer`
71+
/// (it contains a pointer into an internal buffer of `lexer`),
72+
/// so it is no longer valid/you must not use it after a subsequent exclusive borrow of `lexer`
73+
/// (i.e. this borrow has ended).
6674
#[unsafe(no_mangle)]
67-
unsafe extern "C" fn clp_log_mechanic_lexer_next_token<'schema, 'lexer>(
68-
lexer: &'lexer mut Lexer<'schema>,
69-
input: CStringView<'_>,
75+
unsafe extern "C" fn clp_log_mechanic_lexer_next_fragment<'schema, 'lexer, 'input>(
76+
lexer: &'lexer mut Lexer<'schema, 'input>,
77+
input: CStringView<'input>,
7078
pos: &mut usize,
71-
log_component: &mut LogComponent<'schema, 'lexer>,
72-
) -> bool {
73-
if let Some(component) = lexer.next_token(input.as_utf8().unwrap(), pos) {
74-
*log_component = component;
75-
true
76-
} else {
77-
false
79+
) -> CLogFragment<'schema, 'input> {
80+
let fragment: Fragment<'_, '_, '_> = lexer.next_fragment(input.as_utf8().unwrap(), pos);
81+
CLogFragment {
82+
rule: fragment.rule,
83+
start: fragment.lexeme.as_bytes().as_ptr_range().start,
84+
end: fragment.lexeme.as_bytes().as_ptr_range().end,
85+
captures: fragment.captures.as_ptr(),
86+
captures_count: fragment.captures.len(),
7887
}
7988
}
8089

@@ -98,3 +107,45 @@ impl<'lifetime> CStringView<'lifetime> {
98107
str::from_utf8(bytes)
99108
}
100109
}
110+
111+
#[cfg(test)]
112+
mod test {
113+
use super::*;
114+
use crate::regex::Regex;
115+
116+
#[test]
117+
fn basic() {
118+
let mut schema: Schema = Schema::new();
119+
schema.set_delimiters(" ");
120+
schema.add_rule("hello", Regex::from_pattern("hello world").unwrap());
121+
schema.add_rule("bye", Regex::from_pattern("goodbye").unwrap());
122+
123+
let mut lexer: Lexer<'_, '_> = Lexer::new(&schema).unwrap();
124+
let input: CStringView<'_> = CStringView::from_utf8("hello world goodbye hello world goodbye ");
125+
let mut pos: usize = 0;
126+
127+
unsafe {
128+
assert_eq!(
129+
clp_log_mechanic_lexer_next_fragment(&mut lexer, input, &mut pos).rule,
130+
1
131+
);
132+
assert_eq!(
133+
clp_log_mechanic_lexer_next_fragment(&mut lexer, input, &mut pos).rule,
134+
2
135+
);
136+
assert_eq!(
137+
clp_log_mechanic_lexer_next_fragment(&mut lexer, input, &mut pos).rule,
138+
1
139+
);
140+
assert_eq!(
141+
clp_log_mechanic_lexer_next_fragment(&mut lexer, input, &mut pos).rule,
142+
2
143+
);
144+
assert_eq!(
145+
clp_log_mechanic_lexer_next_fragment(&mut lexer, input, &mut pos).rule,
146+
0
147+
);
148+
}
149+
assert_eq!(pos, input.length);
150+
}
151+
}

rust/src/dfa.rs

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,12 @@ pub struct Dfa<'schema> {
2323
number_of_registers: usize,
2424
}
2525

26+
#[derive(Debug, Clone)]
27+
pub struct GotRule<'input> {
28+
pub rule: usize,
29+
pub lexeme: &'input str,
30+
}
31+
2632
#[derive(Debug)]
2733
struct DfaState<'schema> {
2834
kernel: Kernel<'schema>,
@@ -98,14 +104,14 @@ impl<'schema> Dfa<'schema> {
98104
self.simulate_with_captures(input, |name, lexeme| {
99105
println!("- captured {name}, {lexeme}");
100106
})
101-
.is_some()
107+
.is_ok()
102108
}
103109

104110
pub fn simulate_with_captures<'input, F>(
105111
&self,
106112
input: &'input str,
107113
mut on_capture: F,
108-
) -> Option<(usize, &'input str)>
114+
) -> Result<GotRule<'input>, usize>
109115
where
110116
F: FnMut(&'schema str, &'input str),
111117
{
@@ -117,6 +123,8 @@ impl<'schema> Dfa<'schema> {
117123

118124
let mut maybe_backup: Option<(usize, usize, Vec<usize>)> = None;
119125

126+
let mut consumed: usize = 0;
127+
120128
for (i, ch) in input.char_indices() {
121129
println!("=== step {i} (state {current_state}), ch {ch:?} ({})", u32::from(ch));
122130
/*
@@ -130,6 +138,7 @@ impl<'schema> Dfa<'schema> {
130138
}
131139
*/
132140
if let Some(transition) = self.states[current_state].transitions.lookup(u32::from(ch)) {
141+
consumed = i;
133142
current_state = transition.destination;
134143
self.apply_operations(
135144
&mut registers,
@@ -138,22 +147,20 @@ impl<'schema> Dfa<'schema> {
138147
&transition.operations,
139148
&self.states[current_state].tag_for_register,
140149
);
150+
if self.states[current_state].is_final {
151+
maybe_backup = Some((consumed, current_state, registers.clone()));
152+
}
141153
} else {
142154
break;
143155
}
144-
if self.states[current_state].is_final {
145-
maybe_backup = Some((i, current_state, registers.clone()));
146-
}
147156
}
148157

149-
println!("ended at {current_state}");
150-
let (consumed, state, mut registers): (usize, usize, Vec<usize>) = maybe_backup?;
151-
println!("had final at {state}");
158+
let (consumed, state, mut registers): (usize, usize, Vec<usize>) = maybe_backup.ok_or(consumed)?;
152159

153160
self.apply_operations(
154161
&mut registers,
155162
&mut prefix_tree,
156-
consumed + 1,
163+
consumed,
157164
&self.states[state].final_operations,
158165
&self.states[state].tag_for_register,
159166
);
@@ -188,11 +195,14 @@ impl<'schema> Dfa<'schema> {
188195
maybe_rule = Some(var.rule);
189196
}
190197
for (&i, &j) in std::iter::zip(starts.iter(), ends.iter()) {
191-
on_capture(var.name, &input[i..j]);
198+
on_capture(var.name, &input[i..=j]);
192199
}
193200
}
194201

195-
return Some((maybe_rule.unwrap(), &input[..consumed + 1]));
202+
Ok(GotRule {
203+
rule: maybe_rule.unwrap(),
204+
lexeme: &input[..=consumed],
205+
})
196206
}
197207

198208
fn apply_operations(

0 commit comments

Comments
 (0)