@@ -6,6 +6,7 @@ use std::num::NonZeroU64;
66use std:: thread;
77
88use fancy_regex:: Regex ;
9+ use fancy_regex:: RegexBuilder ;
910use pyo3:: exceptions;
1011use pyo3:: prelude:: * ;
1112use pyo3:: pyclass;
@@ -417,7 +418,7 @@ impl CoreBPE {
417418 special_tokens_encoder : HashMap < String , Rank > ,
418419 pattern : & str ,
419420 ) -> PyResult < Self > {
420- let regex = Regex :: new ( pattern)
421+ let regex = RegexBuilder :: new ( pattern) . backtrack_limit ( 10_000 ) . build ( )
421422 . map_err ( |e| PyErr :: new :: < exceptions:: PyValueError , _ > ( e. to_string ( ) ) ) ?;
422423
423424 let special_regex = {
@@ -572,6 +573,7 @@ fn _tiktoken(_py: Python, m: &PyModule) -> PyResult<()> {
572573
573574#[ cfg( test) ]
574575mod tests {
576+ use fancy_regex:: RegexBuilder ;
575577 use rustc_hash:: FxHashMap as HashMap ;
576578
577579 use crate :: { byte_pair_split, Rank } ;
@@ -596,4 +598,16 @@ mod tests {
596598 let res = byte_pair_split ( b"abab" , & ranks) ;
597599 assert_eq ! ( res, vec![ b"ab" , b"ab" ] ) ;
598600 }
601+
// Verifies that a regex built with a very small backtrack limit reports an
// error from `is_match` rather than returning a match result.
#[test]
fn test_effect_of_backtrack_limit() {
    // An alternation under `*` followed by a look-ahead, built with a
    // backtrack limit of only 10. The built regex is used directly; cloning
    // a value that is about to be dropped would be a wasted copy.
    let regex = RegexBuilder::new(r"(a|b|ab)*(?=c)")
        .backtrack_limit(10)
        .build()
        .expect("Failed to build regex");

    // 100 repetitions of "ab" followed by the "c" the look-ahead requires.
    let input = "ab".repeat(100) + "c";

    // With the limit set this low, matching is expected to fail with an error.
    assert!(regex.is_match(&input).is_err(), "Should throw");
}
599613}
0 commit comments