@@ -189,6 +189,9 @@ class Ex::Private
189189
190190 /* * The pattern string as passed by the user */
191191 std::string pattern;
192+
193+ /* * Number of capture groups in the pattern (excluding the whole match) */
194+ size_t captureCount = 0 ;
192195};
193196
194197/* * Compiles a regular expression passed as a string into a stream of tokens that can be used for
@@ -198,6 +201,7 @@ void Ex::Private::compile()
198201{
199202 error = false ;
200203 data.clear ();
204+ captureCount = 0 ;
201205 if (pattern.empty ()) return ;
202206 const char *start = pattern.c_str ();
203207 const char *ps = start;
@@ -206,6 +210,10 @@ void Ex::Private::compile()
206210 int prevTokenPos=-1 ;
207211 int tokenPos=0 ;
208212
213+ // capture group assignment
214+ std::vector<size_t > captureStack;
215+ size_t nextCaptureId = 0 ;
216+
209217 auto addToken = [&](PToken tok)
210218 {
211219 tokenPos++;
@@ -274,12 +282,27 @@ void Ex::Private::compile()
274282 addToken (PToken (PToken::Kind::Any));
275283 break ;
276284 case ' (' : // begin of capture group
277- prevTokenPos = tokenPos;
278- addToken (PToken (PToken::Kind::BeginCapture));
285+ {
286+ prevTokenPos = tokenPos;
287+ addToken (PToken (PToken::Kind::BeginCapture));
288+ size_t id = ++nextCaptureId; // groups start at 1, 0 is whole match
289+ data.back ().setValue (id);
290+ captureStack.push_back (id);
291+ }
279292 break ;
280293 case ' )' : // end of capture group
281- prevTokenPos = tokenPos;
282- addToken (PToken (PToken::Kind::EndCapture));
294+ {
295+ prevTokenPos = tokenPos;
296+ if (captureStack.empty ())
297+ {
298+ error=true ;
299+ return ;
300+ }
301+ size_t id = captureStack.back ();
302+ captureStack.pop_back ();
303+ addToken (PToken (PToken::Kind::EndCapture));
304+ data.back ().setValue (id);
305+ }
283306 break ;
284307 case ' [' : // character class
285308 {
@@ -402,6 +425,12 @@ void Ex::Private::compile()
402425 }
403426 ps++;
404427 }
428+ if (!captureStack.empty ()) // Unmatched '('?
429+ {
430+ error=true ;
431+ return ;
432+ }
433+ captureCount = nextCaptureId;
405434 // addToken(PToken(PToken::Kind::End));
406435}
407436
@@ -412,6 +441,7 @@ void Ex::Private::dump()
412441 size_t l = data.size ();
413442 size_t i =0 ;
414443 DBG (" ==== compiled token stream for pattern '%s' ===\n " ,pattern.c_str ());
444+ DBG (" captureCount=%zu\n " ,captureCount);
415445 while (i<l)
416446 {
417447 DBG (" [%s:%04x]\n " ,data[i].kindStr (),data[i].value ());
@@ -531,7 +561,7 @@ bool Ex::Private::matchAt(size_t tokenPos,size_t tokenLen,std::string_view str,M
531561 size_t tokenStart = ++tokenPos;
532562 while (tokenPos<tokenLen && data[tokenPos].kind ()!=PToken::Kind::EndCapture) { tokenPos++; }
533563 Match rangeMatch;
534- rangeMatch.init (str);
564+ rangeMatch.init (str, 0 );
535565 bool found = matchAt (tokenStart,tokenPos,str,rangeMatch,index,level+1 );
536566 if (found)
537567 {
@@ -614,12 +644,12 @@ bool Ex::Private::matchAt(size_t tokenPos,size_t tokenLen,std::string_view str,M
614644 (isIdChar (str[index]) || index==0 || !isIdChar (str[index-1 ]))) return false ;
615645 break ;
616646 case PToken::Kind::BeginCapture:
617- DBG (" BeginCapture(%zu)\n " ,index);
618- match.startCapture (index);
647+ DBG (" BeginCapture(%zu) gid=%u \n " ,index,tok. value () );
648+ match.startCapture (tok. value (), index);
619649 break ;
620650 case PToken::Kind::EndCapture:
621- DBG (" EndCapture(%zu)\n " ,index);
622- match.endCapture (index);
651+ DBG (" EndCapture(%zu) gid=%u \n " ,index,tok. value () );
652+ match.endCapture (tok. value (), index);
623653 break ;
624654 case PToken::Kind::Any:
625655 if (index>=str.length ()) return false ;
@@ -707,7 +737,7 @@ bool Ex::match(std::string_view str,Match &match,size_t pos) const
707737{
708738 bool found=false ;
709739 if (p->data .size ()==0 || p->error ) return found;
710- match.init (str);
740+ match.init (str,p-> captureCount );
711741
712742 PToken tok = p->data [0 ];
713743 if (tok.kind ()==PToken::Kind::BeginOfLine) // only test match at the given position
@@ -721,10 +751,10 @@ bool Ex::match(std::string_view str,Match &match,size_t pos) const
721751 size_t index = str.find (tok.asciiValue (),pos);
722752 if (index==std::string::npos)
723753 {
724- DBG (" Ex::match(str='%s',pos=%zu)=false (no start char '%c')\n " ,str.c_str (),pos,tok.asciiValue ());
754+ DBG (" Ex::match(str='%s',pos=%zu)=false (no start char '%c')\n " ,std::string ( str) .c_str (),pos,tok.asciiValue ());
725755 return false ;
726756 }
727- DBG (" pos=%zu str='%s' char='%c' index=%zu\n " ,index,str.c_str (),tok.asciiValue (),index);
757+ DBG (" pos=%zu str='%s' char='%c' index=%zu\n " ,index,std::string ( str) .c_str (),tok.asciiValue (),index);
728758 pos=index;
729759 }
730760 while (pos<str.length ()) // search for a match starting at pos
@@ -734,7 +764,7 @@ bool Ex::match(std::string_view str,Match &match,size_t pos) const
734764 pos++;
735765 }
736766 }
737- DBG (" Ex::match(str='%s',pos=%zu)=%d\n " ,str.c_str (),pos,found);
767+ DBG (" Ex::match(str='%s',pos=%zu)=%d\n " ,std::string ( str) .c_str (),pos,found);
738768 return found;
739769}
740770
0 commit comments