diff --git a/Cargo.lock b/Cargo.lock index ef95572d..ccf5bd87 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1153,7 +1153,7 @@ dependencies = [ "indoc", "memmem", "rand", - "regex", + "regex-automata", "serde", "serde_json", "sliceslice", @@ -1170,7 +1170,7 @@ dependencies = [ "indoc", "libc", "num_enum", - "regex", + "regex-automata", "serde", "serde_json", "wirefilter-engine", diff --git a/Cargo.toml b/Cargo.toml index c47ad75f..fa4e9718 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,7 @@ libc = "0.2.42" memmem = "0.1.1" num_enum = "0.7" rand = "0.8" -regex = { version = "1.3.6" } +regex-automata = { version = "0.4.9" } serde = { version = "1.0.113", features = [ "derive" ] } serde_json = "1.0.56" sliceslice = "0.4.3" diff --git a/engine/Cargo.toml b/engine/Cargo.toml index fb3082a6..e83dd984 100644 --- a/engine/Cargo.toml +++ b/engine/Cargo.toml @@ -26,7 +26,7 @@ cidr.workspace = true fnv.workspace = true memmem.workspace = true rand.workspace = true -regex = { workspace = true, optional = true } +regex-automata = { workspace = true, optional = true } serde.workspace = true serde_json.workspace = true sliceslice.workspace = true @@ -38,4 +38,5 @@ criterion.workspace = true indoc.workspace = true [features] +regex = ["dep:regex-automata"] default = [ "regex" ] diff --git a/engine/src/rhs_types/regex/imp_real.rs b/engine/src/rhs_types/regex/imp_real.rs index 750b7fa1..fbb87870 100644 --- a/engine/src/rhs_types/regex/imp_real.rs +++ b/engine/src/rhs_types/regex/imp_real.rs @@ -1,51 +1,92 @@ -use crate::{ParserSettings, RegexFormat}; +use regex_automata::MatchKind; -pub use regex::Error; +use super::Error; +use crate::{ParserSettings, RegexFormat}; +use std::ops::Deref; +use std::sync::Arc; -/// Wrapper around [`regex::bytes::Regex`] +/// Wrapper around [`regex_automata::meta::Regex`] #[derive(Clone)] pub struct Regex { - compiled_regex: regex::bytes::Regex, + pattern: Arc, + regex: regex_automata::meta::Regex, format: RegexFormat, } impl Regex { + /// Retrieves the syntax configuration that will be used to build the regex. + #[inline] + pub fn syntax_config() -> regex_automata::util::syntax::Config { + regex_automata::util::syntax::Config::new() + .unicode(false) + .utf8(false) + } + + /// Retrieves the meta configuration that will be used to build the regex. + #[inline] + pub fn meta_config(settings: &ParserSettings) -> regex_automata::meta::Config { + regex_automata::meta::Config::new() + .match_kind(MatchKind::LeftmostFirst) + .utf8_empty(false) + .dfa(false) + .nfa_size_limit(Some(settings.regex_compiled_size_limit)) + .onepass_size_limit(Some(settings.regex_compiled_size_limit)) + .dfa_size_limit(Some(settings.regex_compiled_size_limit)) + .hybrid_cache_capacity(settings.regex_dfa_size_limit) + } + /// Compiles a regular expression. pub fn new( pattern: &str, format: RegexFormat, settings: &ParserSettings, ) -> Result { - ::regex::bytes::RegexBuilder::new(pattern) - .unicode(false) - .size_limit(settings.regex_compiled_size_limit) - .dfa_size_limit(settings.regex_dfa_size_limit) - .build() - .map(|r| Regex { - compiled_regex: r, + ::regex_automata::meta::Builder::new() + .configure(Self::meta_config(settings)) + .syntax(Self::syntax_config()) + .build(pattern) + .map(|regex| Regex { + pattern: Arc::from(pattern), + regex, format, }) + .map_err(|err| { + if let Some(limit) = err.size_limit() { + Error::CompiledTooBig(limit) + } else if let Some(syntax) = err.syntax_error() { + Error::Syntax(syntax.to_string()) + } else { + unreachable!() + } + }) } - /// Returns true if and only if the regex matches the string given. - pub fn is_match(&self, text: &[u8]) -> bool { - self.compiled_regex.is_match(text) - } - - /// Returns the original string of this regex. + /// Returns the pattern of this regex. + #[inline] pub fn as_str(&self) -> &str { - self.compiled_regex.as_str() + &self.pattern } - /// Returns the format behind the regex + /// Returns the format used by the pattern. + #[inline] pub fn format(&self) -> RegexFormat { self.format } } -impl From for regex::bytes::Regex { +impl From for regex_automata::meta::Regex { + #[inline] fn from(regex: Regex) -> Self { - regex.compiled_regex + regex.regex + } +} + +impl Deref for Regex { + type Target = regex_automata::meta::Regex; + + #[inline] + fn deref(&self) -> &Self::Target { + &self.regex } } diff --git a/engine/src/rhs_types/regex/mod.rs b/engine/src/rhs_types/regex/mod.rs index e5c84af6..804b749d 100644 --- a/engine/src/rhs_types/regex/mod.rs +++ b/engine/src/rhs_types/regex/mod.rs @@ -3,10 +3,9 @@ use crate::rhs_types::bytes::lex_raw_string_as_str; use crate::FilterParser; use cfg_if::cfg_if; use serde::{Serialize, Serializer}; -use std::{ - fmt::{self, Debug, Formatter}, - hash::{Hash, Hasher}, -}; +use std::fmt::{self, Debug, Display, Formatter}; +use std::hash::{Hash, Hasher}; +use thiserror::Error; cfg_if! { if #[cfg(feature = "regex")] { @@ -19,7 +18,7 @@ cfg_if! { } /// RegexFormat describes the format behind the regex -#[derive(PartialEq, Eq, Copy, Clone)] +#[derive(PartialEq, Eq, Copy, Clone, Debug)] pub enum RegexFormat { /// Literal string was used to define the expression Literal, @@ -41,9 +40,20 @@ impl Hash for Regex { } } +impl Display for Regex { + /// Shows the original regular expression. + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.as_str()) + } +} + impl Debug for Regex { + /// Shows the original regular expression. fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - f.write_str(self.as_str()) + f.debug_struct("Regex") + .field("pattern", &self.as_str()) + .field("format", &self.format()) + .finish() } } @@ -120,6 +130,28 @@ impl Serialize for Regex { } } +/// An error that occurred during parsing or compiling a regular expression. +#[non_exhaustive] +#[derive(Clone, Debug, Error, PartialEq)] +pub enum Error { + /// A syntax error. + Syntax(String), + /// The compiled regex exceeded the configured + /// regex compiled size limit. + CompiledTooBig(usize), +} + +impl Display for Error { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match *self { + Error::Syntax(ref err) => Display::fmt(err, f), + Error::CompiledTooBig(limit) => { + write!(f, "Compiled regex exceeds size limit of {} bytes.", limit) + } + } + } +} + #[cfg(test)] mod test { use super::*; diff --git a/ffi/Cargo.toml b/ffi/Cargo.toml index 50ed9244..e34e4cfa 100644 --- a/ffi/Cargo.toml +++ b/ffi/Cargo.toml @@ -25,7 +25,7 @@ wirefilter.workspace = true [dev-dependencies] indoc.workspace = true -regex.workspace = true +regex-automata.workspace = true [build-dependencies] cbindgen = "0.27" diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index 66d68671..f0bd74e9 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -1039,7 +1039,7 @@ pub extern "C" fn wirefilter_get_version() -> StaticRustAllocatedString { #[allow(clippy::bool_assert_comparison)] mod ffi_test { use super::*; - use regex::Regex; + use regex_automata::meta::Regex; use std::ffi::CStr; impl RustAllocatedString {