Skip to content

Commit ac9061a

Browse files
author
Jack Luo
committed
feat: add timestamp rules and event boundary detection
Add is_timestamp field to Schema::Rule and add_timestamp_rule() method. Add is_event_start field to Fragment, set to true when a timestamp rule matches at byte offset 0 or immediately after a newline. This enables downstream consumers to split multi-line log events. Also add schema_add_timestamp_rule FFI and is_event_start to CLogFragment.
1 parent bdfaa4b commit ac9061a

File tree

4 files changed

+82
-2
lines changed

4 files changed

+82
-2
lines changed

rust/include/log_mechanic.generated.hpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,17 +66,21 @@ CLogFragment() = default;
6666
const Capture *captures;
6767
/// Number of captures.
6868
size_t captures_count;
69+
/// Whether this fragment starts a new log event (timestamp at start of line).
70+
bool is_event_start;
6971

7072
CLogFragment(size_t const& rule,
7173
const uint8_t *const& start,
7274
const uint8_t *const& end,
7375
const Capture *const& captures,
74-
size_t const& captures_count)
76+
size_t const& captures_count,
77+
bool const& is_event_start)
7578
: rule(rule),
7679
start(start),
7780
end(end),
7881
captures(captures),
79-
captures_count(captures_count)
82+
captures_count(captures_count),
83+
is_event_start(is_event_start)
8084
{}
8185

8286
};
@@ -98,6 +102,10 @@ CLogFragment clp_log_mechanic_lexer_next_fragment(Lexer *lexer, CStringView inpu
98102

99103
bool clp_log_mechanic_schema_add_rule(Schema *schema, CStringView name, CStringView pattern);
100104

105+
bool clp_log_mechanic_schema_add_timestamp_rule(Schema *schema,
106+
CStringView name,
107+
CStringView pattern);
108+
101109
void clp_log_mechanic_schema_delete(Box<Schema> schema);
102110

103111
Box<Schema> clp_log_mechanic_schema_new();

rust/src/c_interface.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ pub struct CLogFragment<'schema, 'input, 'buffer> {
3030
pub captures: *const Capture<'schema, 'input>,
3131
/// Number of captures.
3232
pub captures_count: usize,
33+
/// Whether this fragment starts a new log event (timestamp at start of line).
34+
pub is_event_start: bool,
3335
/// Indicates that `captures` points into (borrows from) some external `'buffer`.
3436
pub _captures_lifetime: PhantomData<&'buffer [Capture<'schema, 'input>]>,
3537
}
@@ -77,6 +79,25 @@ unsafe extern "C" fn clp_log_mechanic_schema_add_rule(
7779
true
7880
}
7981

82+
/// C entry point that registers a timestamp rule on `schema`.
///
/// `name` and `pattern` are decoded with `as_utf8`, `pattern` is compiled with
/// `Regex::from_pattern`, and the result is passed to `Schema::add_timestamp_rule`.
///
/// Returns `true` once the rule has been added. Returns `false` when either string view fails
/// `as_utf8` or the pattern fails to compile; in those cases `add_timestamp_rule` is never
/// called.
///
/// # Safety
///
/// NOTE(review): presumably the caller must pass a valid, exclusively held `Schema` and string
/// views that reference live memory for the duration of the call; confirm against the
/// contract of `CStringView`.
#[unsafe(no_mangle)]
83+
unsafe extern "C" fn clp_log_mechanic_schema_add_timestamp_rule(
84+
schema: &mut Schema,
85+
name: CStringView<'_>,
86+
pattern: CStringView<'_>,
87+
) -> bool {
88+
// Reject a rule name that `as_utf8` cannot decode.
let Ok(name) = name.as_utf8() else {
89+
return false;
90+
};
91+
// Reject a pattern that `as_utf8` cannot decode.
let Ok(pattern) = pattern.as_utf8() else {
92+
return false;
93+
};
94+
// Compile the pattern; a failure is reported as `false` rather than propagated.
let Ok(regex) = Regex::from_pattern(pattern) else {
95+
return false;
96+
};
97+
// All inputs validated: only now is the schema mutated.
schema.add_timestamp_rule(name, regex);
98+
true
99+
}
100+
80101
#[unsafe(no_mangle)]
81102
unsafe extern "C" fn clp_log_mechanic_schema_rule_count(schema: &Schema) -> usize {
82103
schema.rules().len()
@@ -121,6 +142,7 @@ unsafe extern "C" fn clp_log_mechanic_lexer_next_fragment<'schema, 'lexer, 'inpu
121142
end: fragment.lexeme.as_bytes().as_ptr_range().end,
122143
captures: fragment.captures.as_ptr(),
123144
captures_count: fragment.captures.len(),
145+
is_event_start: fragment.is_event_start,
124146
_captures_lifetime: PhantomData,
125147
}
126148
}

rust/src/lexer.rs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ pub struct Fragment<'schema, 'input, 'buffer> {
1515
pub rule: usize,
1616
pub lexeme: &'input str,
1717
pub captures: &'buffer [Capture<'schema, 'input>],
18+
pub is_event_start: bool,
1819
}
1920

2021
#[repr(C)]
@@ -54,10 +55,14 @@ impl<'schema, 'input> Lexer<'schema, 'input> {
5455
continue;
5556
}
5657
}
58+
let at_line_start = static_text_end == 0
59+
|| input.as_bytes().get(static_text_end - 1) == Some(&b'\n');
60+
let is_event_start = at_line_start && self.schema.rules()[rule].is_timestamp;
5761
return Fragment {
5862
rule,
5963
lexeme,
6064
captures: &self.captures_buffer,
65+
is_event_start,
6166
};
6267
},
6368
Err(consumed) => {
@@ -71,6 +76,7 @@ impl<'schema, 'input> Lexer<'schema, 'input> {
7176
rule: 0,
7277
lexeme: &input[*pos..],
7378
captures: &[],
79+
is_event_start: false,
7480
}
7581
}
7682

@@ -113,6 +119,34 @@ mod test {
113119
assert_eq!(pos, input.len());
114120
}
115121

122+
// Checks that `is_event_start` is set only on fragments matched by a timestamp rule that begin
// at offset 0 of the input or directly after a `\n`, and is clear on ordinary rule matches.
#[test]
123+
fn event_boundary() {
124+
let mut schema: Schema = Schema::new();
125+
schema.set_delimiters(" \n");
126+
// Registered first, so the assertions below expect it as rule 1.
// NOTE(review): this relies on the schema already holding its "static" rule at index 0.
schema.add_timestamp_rule("ts", Regex::from_pattern("[0-9][0-9][0-9][0-9]\\-[0-9][0-9]\\-[0-9][0-9]").unwrap());
127+
// Registered second, so the assertions below expect it as rule 2; not a timestamp rule.
schema.add_rule("word", Regex::from_pattern("[a-zA-Z]+").unwrap());
128+
129+
let mut lexer: Lexer<'_, '_> = Lexer::new(&schema).unwrap();
130+
// Two lines, each starting with a date followed by a word.
let input: &str = "2024-01-15 hello\n2024-01-16 world\n";
131+
let mut pos: usize = 0;
132+
133+
// First line: the date sits at byte offset 0.
let f1 = lexer.next_fragment(input, &mut pos);
134+
assert_eq!(f1.rule, 1); // ts
135+
assert!(f1.is_event_start); // at start of input
136+
137+
let f2 = lexer.next_fragment(input, &mut pos);
138+
assert_eq!(f2.rule, 2); // word "hello"
139+
assert!(!f2.is_event_start);
140+
141+
// Second line: the date directly follows the `\n` that ends the first line.
let f3 = lexer.next_fragment(input, &mut pos);
142+
assert_eq!(f3.rule, 1); // ts
143+
assert!(f3.is_event_start); // after \n
144+
145+
let f4 = lexer.next_fragment(input, &mut pos);
146+
assert_eq!(f4.rule, 2); // word "world"
147+
assert!(!f4.is_event_start);
148+
}
149+
116150
#[test]
117151
fn capture_boundaries() {
118152
let mut schema: Schema = Schema::new();

rust/src/schema.rs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ pub struct Rule {
1111
pub idx: usize,
1212
pub name: String,
1313
pub regex: Regex,
14+
pub is_timestamp: bool,
1415
}
1516

1617
impl Schema {
@@ -22,6 +23,7 @@ impl Schema {
2223
idx: 0,
2324
name: "static".to_owned(),
2425
regex: Self::pattern_for_delimiters(Self::DEFAULT_DELIMITERS),
26+
is_timestamp: false,
2527
}],
2628
delimiters: Self::DEFAULT_DELIMITERS.to_owned(),
2729
}
@@ -41,6 +43,20 @@ impl Schema {
4143
idx,
4244
name: name.into(),
4345
regex,
46+
is_timestamp: false,
47+
});
48+
}
49+
50+
/// Appends a rule flagged as a timestamp rule (`is_timestamp: true`).
///
/// `name` is converted into an owned `String`, and `regex` is stored as given. The new rule's
/// `idx` is the number of rules already present, i.e. its position in `rules` after the push.
///
/// The lexer reads the `is_timestamp` flag when deciding whether a fragment has
/// `is_event_start` set.
pub fn add_timestamp_rule<LikeString>(&mut self, name: LikeString, regex: Regex)
51+
where
52+
LikeString: Into<String>,
53+
{
54+
// Rules are only ever appended here, so the current length is the new rule's index.
let idx: usize = self.rules.len();
55+
self.rules.push(Rule {
56+
idx,
57+
name: name.into(),
58+
regex,
59+
is_timestamp: true,
4460
});
4561
}
4662

0 commit comments

Comments
 (0)