@@ -72,6 +72,7 @@ SQLITE_EXTENSION_INIT1
7272
7373/* The end-of-input character */
7474#define RE_EOF 0 /* End of input */
75+ #define RE_START 0xfffffff /* Start of input - larger than any UTF-8 character */
7576
7677/* The NFA is implemented as a sequence of opcodes taken from the following
7778** set. Each opcode has a single integer argument.
@@ -93,6 +94,33 @@ SQLITE_EXTENSION_INIT1
9394#define RE_OP_SPACE 15 /* space: [ \t\n\r\v\f] */
9495#define RE_OP_NOTSPACE 16 /* Not a space */
9596#define RE_OP_BOUNDARY 17 /* Boundary between word and non-word */
97+ #define RE_OP_ATSTART 18 /* Currently at the start of the string */
98+
99+ #if defined(SQLITE_DEBUG )
/*
** Printable names of the NFA opcodes, for symbolic debugging.  The
** position of each name in the array equals the numeric value of the
** opcode it describes, so the table can be indexed directly by opcode.
*/
static const char *ReOpName[] = {
  /*  0 */ "EOF",
  /*  1 */ "MATCH",
  /*  2 */ "ANY",
  /*  3 */ "ANYSTAR",
  /*  4 */ "FORK",
  /*  5 */ "GOTO",
  /*  6 */ "ACCEPT",
  /*  7 */ "CC_INC",
  /*  8 */ "CC_EXC",
  /*  9 */ "CC_VALUE",
  /* 10 */ "CC_RANGE",
  /* 11 */ "WORD",
  /* 12 */ "NOTWORD",
  /* 13 */ "DIGIT",
  /* 14 */ "NOTDIGIT",
  /* 15 */ "SPACE",
  /* 16 */ "NOTSPACE",
  /* 17 */ "BOUNDARY",
  /* 18 */ "ATSTART",
};
122+ #endif /* SQLITE_DEBUG */
123+
96124
97125/* Each opcode is a "state" in the NFA */
98126typedef unsigned short ReStateNumber ;
@@ -127,7 +155,7 @@ struct ReCompiled {
127155 int * aArg ; /* Arguments to each operator */
128156 unsigned (* xNextChar )(ReInput * ); /* Next character function */
129157 unsigned char zInit [12 ]; /* Initial text to match */
130- int nInit ; /* Number of characters in zInit */
158+ int nInit ; /* Number of bytes in zInit */
131159 unsigned nState ; /* Number of entries in aOp[] and aArg[] */
132160 unsigned nAlloc ; /* Slots allocated for aOp[] and aArg[] */
133161};
@@ -200,7 +228,7 @@ static int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){
200228 ReStateNumber * pToFree ;
201229 unsigned int i = 0 ;
202230 unsigned int iSwap = 0 ;
203- int c = RE_EOF + 1 ;
231+ int c = RE_START ;
204232 int cPrev = 0 ;
205233 int rc = 0 ;
206234 ReInput in ;
@@ -219,6 +247,7 @@ static int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){
219247 in .i ++ ;
220248 }
221249 if ( in .i + pRe -> nInit > in .mx ) return 0 ;
250+ c = RE_START - 1 ;
222251 }
223252
224253 if ( pRe -> nState <=(sizeof (aSpace )/(sizeof (aSpace [0 ])* 2 )) ){
@@ -247,6 +276,10 @@ static int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){
247276 if ( pRe -> aArg [x ]== c ) re_add_state (pNext , x + 1 );
248277 break ;
249278 }
279+ case RE_OP_ATSTART : {
280+ if ( cPrev == RE_START ) re_add_state (pThis , x + 1 );
281+ break ;
282+ }
250283 case RE_OP_ANY : {
251284 if ( c != 0 ) re_add_state (pNext , x + 1 );
252285 break ;
@@ -328,7 +361,9 @@ static int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){
328361 }
329362 }
330363 for (i = 0 ; i < pNext -> nState ; i ++ ){
331- if ( pRe -> aOp [pNext -> aState [i ]]== RE_OP_ACCEPT ){ rc = 1 ; break ; }
364+ int x = pNext -> aState [i ];
365+ while ( pRe -> aOp [x ]== RE_OP_GOTO ) x += pRe -> aArg [x ];
366+ if ( pRe -> aOp [x ]== RE_OP_ACCEPT ){ rc = 1 ; break ; }
332367 }
333368re_match_end :
334369 sqlite3_free (pToFree );
@@ -483,7 +518,6 @@ static const char *re_subcompile_string(ReCompiled *p){
483518 iStart = p -> nState ;
484519 switch ( c ){
485520 case '|' :
486- case '$' :
487521 case ')' : {
488522 p -> sIn .i -- ;
489523 return 0 ;
@@ -520,6 +554,14 @@ static const char *re_subcompile_string(ReCompiled *p){
520554 re_insert (p , iPrev , RE_OP_FORK , p -> nState - iPrev + 1 );
521555 break ;
522556 }
557+ case '$' : {
558+ re_append (p , RE_OP_MATCH , RE_EOF );
559+ break ;
560+ }
561+ case '^' : {
562+ re_append (p , RE_OP_ATSTART , 0 );
563+ break ;
564+ }
523565 case '{' : {
524566 int m = 0 , n = 0 ;
525567 int sz , j ;
@@ -538,6 +580,7 @@ static const char *re_subcompile_string(ReCompiled *p){
538580 if ( m == 0 ){
539581 if ( n == 0 ) return "both m and n are zero in '{m,n}'" ;
540582 re_insert (p , iPrev , RE_OP_FORK , sz + 1 );
583+ iPrev ++ ;
541584 n -- ;
542585 }else {
543586 for (j = 1 ; j < m ; j ++ ) re_copy (p , iPrev , sz );
@@ -656,11 +699,7 @@ static const char *re_compile(ReCompiled **ppRe, const char *zIn, int noCase){
656699 re_free (pRe );
657700 return zErr ;
658701 }
659- if ( rePeek (pRe )== '$' && pRe -> sIn .i + 1 >=pRe -> sIn .mx ){
660- re_append (pRe , RE_OP_MATCH , RE_EOF );
661- re_append (pRe , RE_OP_ACCEPT , 0 );
662- * ppRe = pRe ;
663- }else if ( pRe -> sIn .i >=pRe -> sIn .mx ){
702+ if ( pRe -> sIn .i >=pRe -> sIn .mx ){
664703 re_append (pRe , RE_OP_ACCEPT , 0 );
665704 * ppRe = pRe ;
666705 }else {
@@ -744,6 +783,67 @@ static void re_sql_func(
744783 }
745784}
746785
#if defined(SQLITE_DEBUG)
/*
** SQL function:  regexp_bytecode(PATTERN)
**
** This function is used for testing and debugging only.  It is only
** available if the SQLITE_DEBUG compile-time option is used.
**
** Compile the regular expression PATTERN and return the compiled program
** rendered as text:
**
**   *  If the compiled expression has a non-empty zInit[] prefix, one
**      line "INIT " followed by the prefix bytes in hexadecimal.
**   *  One line per NFA state holding the opcode name and its argument.
**
** The newline that terminates the final line is omitted from the result.
**
** A NULL PATTERN yields a NULL result.  A pattern that fails to compile
** raises the error message returned by re_compile().  Memory allocation
** failures, including those that occur while the text is being assembled,
** are reported as errors rather than silently producing NULL.
*/
static void re_bytecode_func(
  sqlite3_context *context,
  int argc,
  sqlite3_value **argv
){
  const char *zPattern;      /* The regular expression text */
  const char *zErr;          /* Compile error message, or NULL */
  ReCompiled *pRe;           /* The compiled regular expression */
  sqlite3_str *pStr;         /* Accumulates the rendered program */
  const unsigned nName = (unsigned)(sizeof(ReOpName)/sizeof(ReOpName[0]));
  int i;
  int n;                     /* Length of the rendered text in bytes */
  int rc;                    /* Error state of pStr */
  char *z;                   /* The rendered text */

  (void)argc;                /* Unused; the function is registered with 1 arg */

  zPattern = (const char*)sqlite3_value_text(argv[0]);
  if( zPattern==0 ) return;
  /* A non-NULL user-data pointer requests case-insensitive compilation */
  zErr = re_compile(&pRe, zPattern, sqlite3_user_data(context)!=0);
  if( zErr ){
    re_free(pRe);
    sqlite3_result_error(context, zErr, -1);
    return;
  }
  if( pRe==0 ){
    sqlite3_result_error_nomem(context);
    return;
  }
  pStr = sqlite3_str_new(0);
  if( pStr==0 ){
    sqlite3_result_error_nomem(context);
    goto re_bytecode_func_err;
  }
  if( pRe->nInit>0 ){
    sqlite3_str_appendf(pStr, "INIT     ");
    for(i=0; i<pRe->nInit; i++){
      sqlite3_str_appendf(pStr, "%02x", pRe->zInit[i]);
    }
    sqlite3_str_appendf(pStr, "\n");
  }
  for(i=0; (unsigned)i<pRe->nState; i++){
    /* Never index past the end of ReOpName[], even if an opcode has been
    ** added to the engine without a matching name in the table. */
    unsigned op = (unsigned char)pRe->aOp[i];
    const char *zName = op<nName ? ReOpName[op] : "???";
    sqlite3_str_appendf(pStr, "%-8s %4d\n", zName, pRe->aArg[i]);
  }

  /* The error state must be sampled before sqlite3_str_finish() destroys
  ** the string object. */
  rc = sqlite3_str_errcode(pStr);
  n = sqlite3_str_length(pStr);
  z = sqlite3_str_finish(pStr);
  if( rc!=SQLITE_OK ){
    sqlite3_free(z);
    if( rc==SQLITE_TOOBIG ){
      sqlite3_result_error_toobig(context);
    }else{
      sqlite3_result_error_nomem(context);
    }
  }else if( n==0 ){
    sqlite3_free(z);
  }else{
    /* n-1 drops the trailing newline.  Ownership of z passes to SQLite. */
    sqlite3_result_text(context, z, n-1, sqlite3_free);
  }

re_bytecode_func_err:
  re_free(pRe);
}

#endif /* SQLITE_DEBUG */
845+
846+
747847/*
748848** Invoke this routine to register the regexp() function with the
749849** SQLite database connection.
@@ -768,6 +868,13 @@ int sqlite3_regexp_init(
768868 rc = sqlite3_create_function (db , "regexpi" , 2 ,
769869 SQLITE_UTF8 |SQLITE_INNOCUOUS |SQLITE_DETERMINISTIC ,
770870 (void * )db , re_sql_func , 0 , 0 );
871+ #if defined(SQLITE_DEBUG )
872+ if ( rc == SQLITE_OK ){
873+ rc = sqlite3_create_function (db , "regexp_bytecode" , 1 ,
874+ SQLITE_UTF8 |SQLITE_INNOCUOUS |SQLITE_DETERMINISTIC ,
875+ 0 , re_bytecode_func , 0 , 0 );
876+ }
877+ #endif /* SQLITE_DEBUG */
771878 }
772879 return rc ;
773880}
0 commit comments