@@ -41,17 +41,19 @@ struct analysis_info {
4141 fsm_state_t start ; /* start state */
4242
4343 /* The states with a /./ self edge representing the unanchored
44- * start and end, or NO_STATE. There can be at most one of each. */
44+ * start and end, or LINKAGE_NO_STATE. There can be at most one
45+ * of each. Copied from linkage_info. */
4546 fsm_state_t unanchored_start_loop ;
4647 fsm_state_t unanchored_end_loop ;
4748
48- /* The end state following the unanchored end loop. */
49+ /* The end state following the unanchored end loop.
50+ * Copied from linkage_info. */
4951 fsm_state_t unanchored_end_loop_end ;
5052
51- /* States that link to paths only reachable from the beginning of input. */
53+ /* States that link to paths only reachable from the beginning of input.
54+ * Copied from linkage_info. */
5255 struct state_set * anchored_starts ;
53-
54- /* States leading to an anchored end. */
56+ /* States leading to an anchored end. Copied from linkage_info. */
5557 struct state_set * anchored_ends ;
5658
5759 /* States with an outgoing labeled edge to the unanchored end loop. Input
@@ -213,25 +215,6 @@ fsm_union_array(size_t fsm_count,
213215 return res ;
214216}
215217
216- static bool
217- state_has_dot_self_edge (const struct fsm * nfa , fsm_state_t s_i )
218- {
219- const struct fsm_state * s = & nfa -> states [s_i ];
220-
221- struct edge_group_iter ei ;
222- edge_set_group_iter_reset (s -> edges , EDGE_GROUP_ITER_ALL , & ei );
223- struct edge_group_iter_info info ;
224- while (edge_set_group_iter_next (& ei , & info )) {
225- if (info .to != s_i ) { continue ; }
226- for (size_t i = 0 ; i < 256 /64 ; i ++ ) {
227- if (info .symbols [i ] != (uint64_t )-1 ) { continue ; }
228- }
229- return true;
230- }
231-
232- return false;
233- }
234-
235218#if LOG_ANALYZE_GROUP_NFA_RESULTS
236219static void
237220dump_state_set (FILE * f , const char * name , const struct state_set * set )
@@ -264,63 +247,6 @@ dump_edge_set(FILE *f, const char *name, fsm_state_t from, const struct edge_set
264247}
265248#endif
266249
267- /* For each state in the epsilon closure, if there's a labeled edge
268- * to an end state with no outgoing edges, check if the label set is
269- * only [\n] and there's also an epsilon edge to the same end state.
270- * If so, this represents an anchored end in the NFA. */
271- static bool
272- state_has_epsilon_and_newline_edges_to_same_end (const struct fsm * nfa ,
273- fsm_state_t s_id , struct state_set * s_eclosure , fsm_state_t * dst_end )
274- {
275- struct state_iter si ;
276- state_set_reset (s_eclosure , & si );
277- fsm_state_t ns_i ;
278- while (state_set_next (& si , & ns_i )) {
279- assert (ns_i < nfa -> statecount );
280- const struct fsm_state * ns = & nfa -> states [ns_i ];
281-
282- if (state_set_empty (ns -> epsilons )) { continue ; }
283- if (edge_set_empty (ns -> edges )) { continue ; }
284-
285- struct edge_group_iter iter ;
286- struct edge_group_iter_info info ;
287- edge_set_group_iter_reset (ns -> edges , EDGE_GROUP_ITER_ALL , & iter );
288- while (edge_set_group_iter_next (& iter , & info )) {
289- /* Look for an edge set with only '\n' */
290- if ((info .symbols [0 ] != (1ULL << '\n' ))
291- || info .symbols [1 ] || info .symbols [2 ] || info .symbols [3 ]) {
292- continue ;
293- }
294-
295- /* If it's an end, look for an epsilon leading to the same destination */
296- if (fsm_isend (nfa , info .to )) {
297- assert (info .to < nfa -> statecount );
298- const struct fsm_state * end_candidate = & nfa -> states [info .to ];
299- if (!state_set_empty (end_candidate -> epsilons ) ||
300- !edge_set_empty (end_candidate -> edges )) {
301- continue ; /* not an anchored end */
302- }
303-
304- struct state_iter inner_si ;
305- fsm_state_t os_i ;
306-
307- assert (s_id < nfa -> statecount );
308- const struct fsm_state * s = & nfa -> states [s_id ];
309-
310- state_set_reset (s -> epsilons , & inner_si );
311- while (state_set_next (& inner_si , & os_i )) {
312- if (os_i == info .to ) {
313- * dst_end = info .to ;
314- return true;
315- }
316- }
317- }
318- }
319- }
320-
321- return false;
322- }
323-
324250static bool
325251state_has_labeled_edge_to_eclosure_with_unanchored_end_loop (const struct fsm * nfa ,
326252 fsm_state_t s_i , struct state_set * * eclosures ,
@@ -394,11 +320,7 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo)
394320 }
395321 }
396322
397- memset (ainfo , 0x00 , sizeof (* ainfo ));
398323 ainfo -> start = NO_STATE ;
399- ainfo -> unanchored_start_loop = NO_STATE ;
400- ainfo -> unanchored_end_loop = NO_STATE ;
401- ainfo -> unanchored_end_loop_end = NO_STATE ;
402324 ainfo -> eager_match_state = NO_STATE ;
403325
404326 if (!fsm_getstart (nfa , & ainfo -> start )) {
@@ -445,80 +367,6 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo)
445367 }
446368 }
447369
448- /* Iterate over the start state's epsilon edges, attempting to
449- * identify the unanchored start loop and anchored start states
450- * (if present).
451- *
452- * Note: This uses the start state's epsilon set rather than its
453- * epsilon closure because (by construction) the unanchored
454- * start loop and anchored start states will both be directly
455- * connected to the start state. Using the epsilon closure can
456- * mis-identify the unanchored *end* loop as the start loop, if
457- * there is a path with only epsilon edges between them. */
458- struct state_iter si ;
459- state_set_reset (nfa -> states [ainfo -> start ].epsilons , & si );
460- fsm_state_t ns_i ;
461- while (state_set_next (& si , & ns_i )) {
462- /* Ignore self edges. */
463- if (ns_i == ainfo -> start ) { continue ; }
464-
465- /* If there's a state in the start state's epsilon set that
466- * has a dot self-edge, it's the unanchored start loop. */
467- if (state_has_dot_self_edge (nfa , ns_i )) {
468- if (LOG_ANALYZE_GROUP_NFA ) {
469- fprintf (stderr , "%s: unanchored_start_loop found on state %d\n" , __func__ , ns_i );
470- }
471-
472- /* By construction, the true unanchored start loop is only reachable
473- * via an epsilon edge from the start state, so if any other state
474- * has an epsilon or labeled edge to this one, it cannot be the
475- * unanchored start loop.
476- *
477- * This is necessary for cases like '^|x', which produces:
478- *
479- * 0 -> 2;
480- * 0 -> 3;
481- * 2 -> 2 "\x00" .. "\xff";
482- * 2 -> 3 "x";
483- * 3 -> 1;
484- * 3 -> 3 ?;
485- *
486- * start: 0;
487- * end: 1 = [0];
488- *
489- * where this analysis would otherwise identify both 2 (correct)
490- * and 3 (incorrect) as the unanchored start loop. Both are reachable
491- * from the start state via an epsilon edge, but the labeled edge
492- * 2->3 'x' rules 3 out.
493- * */
494- if (state_set_contains (ainfo -> reachable_from_nonstart_state , ns_i )) {
495- continue ;
496- }
497-
498- /* The reachable_from_nonstart_state check handles the other cases,
499- * but for `$|^` other attempts to distinguish them will fail,
500- * but by construction the USL will have the earlier state ID. */
501- if (ainfo -> unanchored_start_loop != NO_STATE &&
502- ainfo -> unanchored_start_loop < ns_i ) {
503- continue ;
504- }
505-
506- ainfo -> unanchored_start_loop = ns_i ;
507- continue ;
508- } else {
509- /* Otherwise, a state without a dot self-edge is an anchored start.
510- * There may be more than one. */
511- if (LOG_ANALYZE_GROUP_NFA ) {
512- fprintf (stderr , "%s: anchored_start found on state %d\n" , __func__ , ns_i );
513- }
514-
515- if (!state_set_add (& ainfo -> anchored_starts , nfa -> alloc , ns_i )) {
516- goto alloc_fail ;
517- }
518- continue ;
519- }
520- }
521-
522370 /* Copy labeled edges from the unanchored start loop and
523371 * its epsilon closure to ainfo->repeatable_firsts, except
524372 * for edges leading back to the unanchored start loop. */
@@ -576,35 +424,8 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo)
576424 * trivially match, but otherwise it would never match. */
577425 ainfo -> nullable = start_state_epsilon_closure_matches_empty_string (nfa , eclosures [ainfo -> start ]);
578426
579- /* If there's a state with a dot self-edge and an epsilon edge to an end state, it's
580- * the unanchored end loop. There should only be one. */
581- for (size_t s_i = 0 ; s_i < state_count ; s_i ++ ) {
582- const struct fsm_state * s = & nfa -> states [s_i ];
583- if (state_has_dot_self_edge (nfa , s_i )) {
584- struct state_iter si ;
585- state_set_reset (s -> epsilons , & si );
586- fsm_state_t ns_i ;
587- while (state_set_next (& si , & ns_i )) {
588- if (fsm_isend (nfa , ns_i )) {
589- assert (ainfo -> unanchored_end_loop == NO_STATE );
590- ainfo -> unanchored_end_loop = s_i ;
591- ainfo -> unanchored_end_loop_end = ns_i ;
592- break ;
593- }
594- }
595- if (ainfo -> unanchored_end_loop != NO_STATE ) { break ; }
596- }
597- }
598-
599427 /* Collect states that lead to an anchored end or eager match. */
600428 for (size_t s_i = 0 ; s_i < state_count ; s_i ++ ) {
601- fsm_state_t dst_end = NO_STATE ;
602- if (state_has_epsilon_and_newline_edges_to_same_end (nfa , s_i , eclosures [s_i ], & dst_end )) {
603- if (!state_set_add (& ainfo -> anchored_ends , nfa -> alloc , dst_end )) {
604- goto alloc_fail ;
605- }
606- }
607-
608429 fsm_state_t indirect_dst = NO_STATE ;
609430 if (state_has_labeled_edge_to_eclosure_with_unanchored_end_loop (nfa , s_i , eclosures , ainfo -> unanchored_end_loop , & indirect_dst )) {
610431 if (!state_set_add (& ainfo -> eager_matches , nfa -> alloc , s_i )) {
@@ -619,57 +440,6 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo)
619440 }
620441 }
621442
622- /* Compare/log the linkage info */
623- #define COMPARE_LINKAGE_INFO 1
624- #define LOG_LINKAGE_INFO 1
625- if (LOG_LINKAGE_INFO ) {
626- struct state_iter si ;
627- state_set_reset (ainfo -> anchored_starts , & si );
628- fsm_state_t s_i ;
629-
630- fprintf (stderr , "ainfo->anchored_starts count: %zd\n" , state_set_count (ainfo -> anchored_starts ));
631- state_set_reset (ainfo -> anchored_starts , & si );
632- while (state_set_next (& si , & s_i )) {
633- fprintf (stderr , "ainfo->anchored_starts: %d\n" , s_i );
634- }
635-
636- fprintf (stderr , "linkage_info->anchored_starts count: %zd\n" , state_set_count (nfa -> linkage_info -> anchored_starts ));
637- state_set_reset (nfa -> linkage_info -> anchored_starts , & si );
638- while (state_set_next (& si , & s_i )) {
639- fprintf (stderr , "linkage_info->anchored_starts: %d\n" , s_i );
640- }
641-
642- assert (state_set_count (nfa -> linkage_info -> anchored_starts ) >= state_set_count (ainfo -> anchored_starts ));
643- state_set_reset (ainfo -> anchored_starts , & si );
644- while (state_set_next (& si , & s_i )) {
645- assert (state_set_contains (nfa -> linkage_info -> anchored_starts , s_i ));
646- }
647- }
648-
649- if (LOG_LINKAGE_INFO ) {
650- struct state_iter si ;
651- state_set_reset (ainfo -> anchored_ends , & si );
652- fsm_state_t s_i ;
653-
654- fprintf (stderr , "ainfo->anchored_ends count: %zd\n" , state_set_count (ainfo -> anchored_ends ));
655- state_set_reset (ainfo -> anchored_ends , & si );
656- while (state_set_next (& si , & s_i )) {
657- fprintf (stderr , "ainfo->anchored_ends: %d\n" , s_i );
658- }
659-
660- fprintf (stderr , "linkage_info->anchored_ends count: %zd\n" , state_set_count (nfa -> linkage_info -> anchored_ends ));
661- state_set_reset (nfa -> linkage_info -> anchored_ends , & si );
662- while (state_set_next (& si , & s_i )) {
663- fprintf (stderr , "linkage_info->anchored_ends: %d\n" , s_i );
664- }
665-
666- assert (state_set_count (nfa -> linkage_info -> anchored_ends ) >= state_set_count (ainfo -> anchored_ends ));
667- state_set_reset (ainfo -> anchored_ends , & si );
668- while (state_set_next (& si , & s_i )) {
669- assert (state_set_contains (nfa -> linkage_info -> anchored_ends , s_i ));
670- }
671- }
672-
673443#if LOG_ANALYZE_GROUP_NFA_RESULTS
674444 {
675445 fprintf (stderr , "# analysis_info start %d, usl %d, uel %d, uele %d\n" ,
@@ -682,27 +452,9 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo)
682452
683453 closure_free (nfa , eclosures , state_count );
684454
685- if (COMPARE_LINKAGE_INFO ) {
686- /* Check that the analysis and saved linkage_info from ast_compile.c match */
687- fprintf (stderr , "%s: checking that build-time data matches... usl %d, %d; uel %d, %d; uele %d, %d\n" ,
688- __func__ ,
689- nfa -> linkage_info -> unanchored_start_loop , ainfo -> unanchored_start_loop ,
690- nfa -> linkage_info -> unanchored_end_loop , ainfo -> unanchored_end_loop ,
691- nfa -> linkage_info -> unanchored_end_loop_end , ainfo -> unanchored_end_loop_end );
692-
693- if (nfa -> linkage_info -> unanchored_start_loop != ainfo -> unanchored_start_loop ) {
694- fprintf (stderr , "DISAGREEMENT, overriding\n" );
695- ainfo -> unanchored_start_loop = nfa -> linkage_info -> unanchored_start_loop ;
696- }
697-
698- assert (nfa -> linkage_info -> unanchored_start_loop == ainfo -> unanchored_start_loop );
699- assert (nfa -> linkage_info -> unanchored_end_loop == ainfo -> unanchored_end_loop );
700- assert (nfa -> linkage_info -> unanchored_end_loop_end == ainfo -> unanchored_end_loop_end );
701- }
702-
703455 /* The unanchored start and end loop cannot be the same state. */
704- assert (nfa -> linkage_info -> unanchored_start_loop == NO_STATE
705- || nfa -> linkage_info -> unanchored_start_loop != nfa -> linkage_info -> unanchored_end_loop );
456+ assert (ainfo -> unanchored_start_loop == NO_STATE
457+ || ainfo -> unanchored_start_loop != ainfo -> unanchored_end_loop );
706458
707459 return true;
708460
@@ -912,8 +664,6 @@ rebase_analysis_info(struct analysis_info *ainfo, fsm_state_t base)
912664static void
913665free_analysis (const struct fsm_alloc * alloc , struct analysis_info * ainfo )
914666{
915- state_set_free (ainfo -> anchored_ends );
916- state_set_free (ainfo -> anchored_starts );
917667 state_set_free (ainfo -> eager_matches );
918668 state_set_free (ainfo -> needs_indirect_epsilon_edge_to_eager_match_state );
919669 state_set_free (ainfo -> reachable_from_nonstart_state );
@@ -958,8 +708,26 @@ fsm_union_repeated_pattern_group(size_t nfa_count,
958708 for (size_t i = 0 ; i < nfa_count ; i ++ ) {
959709 struct fsm * fsm = nfas [i ];
960710
711+ struct analysis_info * ainfo = & ainfos [i ];
712+
713+ /* Copy these fields over, because fsm->linkage_info will be
714+ * freed during the call to fsm_merge below. */
715+ {
716+ struct linkage_info * linkage_info = fsm -> linkage_info ;
717+
718+ ainfo -> unanchored_start_loop = linkage_info -> unanchored_start_loop ;
719+ ainfo -> unanchored_end_loop = linkage_info -> unanchored_end_loop ;
720+ ainfo -> unanchored_end_loop_end = linkage_info -> unanchored_end_loop_end ;
721+
722+ /* Transfer ownership of these. */
723+ ainfo -> anchored_starts = linkage_info -> anchored_starts ;
724+ linkage_info -> anchored_starts = NULL ;
725+ ainfo -> anchored_ends = linkage_info -> anchored_ends ;
726+ linkage_info -> anchored_ends = NULL ;
727+ }
728+
961729 /* Identify various states in the NFA that will be relevant to combining. */
962- if (!analyze_group_nfa (fsm , & ainfos [ i ] )) {
730+ if (!analyze_group_nfa (fsm , ainfo )) {
963731 goto fail ;
964732 }
965733
0 commit comments