Skip to content

Commit 9971fe1

Browse files
committed
Copy some fields from linkage_info, remove analysis.
They need to be copied over (transferring ownership of the state sets) because the linkage_info struct gets freed during the call to fsm_merge. This removes some code that tended to reconstruct the same information by analyzing the NFA structure, but was significantly more complicated, and in at least one case it led to the wrong result (albeit harmlessly). It's simpler to just save the relevant state IDs during NFA construction.
1 parent fcde3b7 commit 9971fe1

File tree

1 file changed

+28
-260
lines changed

1 file changed

+28
-260
lines changed

src/libfsm/union.c

Lines changed: 28 additions & 260 deletions
Original file line numberDiff line numberDiff line change
@@ -41,17 +41,19 @@ struct analysis_info {
4141
fsm_state_t start; /* start state */
4242

4343
/* The states with a /./ self edge representing the unanchored
44-
* start and end, or NO_STATE. There can be at most one of each. */
44+
* start and end, or LINKAGE_NO_STATE. There can be at most one
45+
* of each. Copied from linkage_info. */
4546
fsm_state_t unanchored_start_loop;
4647
fsm_state_t unanchored_end_loop;
4748

48-
/* The end state following the unanchored end loop. */
49+
/* The end state following the unanchored end loop.
50+
* Copied from linkage_info.*/
4951
fsm_state_t unanchored_end_loop_end;
5052

51-
/* States that link to paths only reachable from the beginning of input. */
53+
/* States that link to paths only reachable from the beginning of input.
54+
* Copied from linkage_info. */
5255
struct state_set *anchored_starts;
53-
54-
/* States leading to an anchored end. */
56+
/* States leading to an anchored end. Copied from linkage_info. */
5557
struct state_set *anchored_ends;
5658

5759
/* States with an outgoing labeled edge to the unanchored end loop. Input
@@ -213,25 +215,6 @@ fsm_union_array(size_t fsm_count,
213215
return res;
214216
}
215217

216-
static bool
217-
state_has_dot_self_edge(const struct fsm *nfa, fsm_state_t s_i)
218-
{
219-
const struct fsm_state *s = &nfa->states[s_i];
220-
221-
struct edge_group_iter ei;
222-
edge_set_group_iter_reset(s->edges, EDGE_GROUP_ITER_ALL, &ei);
223-
struct edge_group_iter_info info;
224-
while (edge_set_group_iter_next(&ei, &info)) {
225-
if (info.to != s_i) { continue; }
226-
for (size_t i = 0; i < 256/64; i++) {
227-
if (info.symbols[i] != (uint64_t)-1) { continue; }
228-
}
229-
return true;
230-
}
231-
232-
return false;
233-
}
234-
235218
#if LOG_ANALYZE_GROUP_NFA_RESULTS
236219
static void
237220
dump_state_set(FILE *f, const char *name, const struct state_set *set)
@@ -264,63 +247,6 @@ dump_edge_set(FILE *f, const char *name, fsm_state_t from, const struct edge_set
264247
}
265248
#endif
266249

267-
/* For each state in the epsilon closure, if there's a labeled edge
268-
* to an end state with no outgoing edges, check if the label set is
269-
* only [\n] and there's also an epsilon edge to the same end state.
270-
* If so, this represents an anchored end in the NFA. */
271-
static bool
272-
state_has_epsilon_and_newline_edges_to_same_end(const struct fsm *nfa,
273-
fsm_state_t s_id, struct state_set *s_eclosure, fsm_state_t *dst_end)
274-
{
275-
struct state_iter si;
276-
state_set_reset(s_eclosure, &si);
277-
fsm_state_t ns_i;
278-
while (state_set_next(&si, &ns_i)) {
279-
assert(ns_i < nfa->statecount);
280-
const struct fsm_state *ns = &nfa->states[ns_i];
281-
282-
if (state_set_empty(ns->epsilons)) { continue; }
283-
if (edge_set_empty(ns->edges)) { continue; }
284-
285-
struct edge_group_iter iter;
286-
struct edge_group_iter_info info;
287-
edge_set_group_iter_reset(ns->edges, EDGE_GROUP_ITER_ALL, &iter);
288-
while (edge_set_group_iter_next(&iter, &info)) {
289-
/* Look for an edge set with only '\n' */
290-
if ((info.symbols[0] != (1ULL << '\n'))
291-
|| info.symbols[1] || info.symbols[2] || info.symbols[3]) {
292-
continue;
293-
}
294-
295-
/* If it's an end, look for an epsilon leading to the same destination */
296-
if (fsm_isend(nfa, info.to)) {
297-
assert(info.to < nfa->statecount);
298-
const struct fsm_state *end_candidate = &nfa->states[info.to];
299-
if (!state_set_empty(end_candidate->epsilons) ||
300-
!edge_set_empty(end_candidate->edges)) {
301-
continue; /* not an anchored end */
302-
}
303-
304-
struct state_iter inner_si;
305-
fsm_state_t os_i;
306-
307-
assert(s_id < nfa->statecount);
308-
const struct fsm_state *s = &nfa->states[s_id];
309-
310-
state_set_reset(s->epsilons, &inner_si);
311-
while (state_set_next(&inner_si, &os_i)) {
312-
if (os_i == info.to) {
313-
*dst_end = info.to;
314-
return true;
315-
}
316-
}
317-
}
318-
}
319-
}
320-
321-
return false;
322-
}
323-
324250
static bool
325251
state_has_labeled_edge_to_eclosure_with_unanchored_end_loop(const struct fsm *nfa,
326252
fsm_state_t s_i, struct state_set **eclosures,
@@ -394,11 +320,7 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo)
394320
}
395321
}
396322

397-
memset(ainfo, 0x00, sizeof(*ainfo));
398323
ainfo->start = NO_STATE;
399-
ainfo->unanchored_start_loop = NO_STATE;
400-
ainfo->unanchored_end_loop = NO_STATE;
401-
ainfo->unanchored_end_loop_end = NO_STATE;
402324
ainfo->eager_match_state = NO_STATE;
403325

404326
if (!fsm_getstart(nfa, &ainfo->start)) {
@@ -445,80 +367,6 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo)
445367
}
446368
}
447369

448-
/* Iterate over the start state's epsilon edges, attempting to
449-
* identify the unanchored start loop and anchored start states
450-
* (if present).
451-
*
452-
* Note: This uses the start state's epsilon set rather than its
453-
* epsilon closure because (by construction) the unanchored
454-
* start loop and anchored start states will both be directly
455-
* connected to the start state. Using the epsilon closure can
456-
* mis-identify the unanchored *end* loop as the start loop, if
457-
* there is a path with only epsilon edges between them. */
458-
struct state_iter si;
459-
state_set_reset(nfa->states[ainfo->start].epsilons, &si);
460-
fsm_state_t ns_i;
461-
while (state_set_next(&si, &ns_i)) {
462-
/* Ignore self edges. */
463-
if (ns_i == ainfo->start) { continue; }
464-
465-
/* If there's a state in the start state's epsilon set that
466-
* has a dot self-edge, it's the unanchored start loop. */
467-
if (state_has_dot_self_edge(nfa, ns_i)) {
468-
if (LOG_ANALYZE_GROUP_NFA) {
469-
fprintf(stderr, "%s: unanchored_start_loop found on state %d\n", __func__, ns_i);
470-
}
471-
472-
/* By construction, the true unanchored start loop is only reachable
473-
* via an epsilon edge from the start state, so if any other state
474-
* has an epsilon or labeled edge to this one, it cannot be the
475-
* unanchored start loop.
476-
*
477-
* This is necessary for cases like '^|x', which produces:
478-
*
479-
* 0 -> 2;
480-
* 0 -> 3;
481-
* 2 -> 2 "\x00" .. "\xff";
482-
* 2 -> 3 "x";
483-
* 3 -> 1;
484-
* 3 -> 3 ?;
485-
*
486-
* start: 0;
487-
* end: 1 = [0];
488-
*
489-
* where this analysis would otherwise identify both 2 (correct)
490-
* and 3 (incorrect) as the unanchored start loop. Both are reachable
491-
* from the start state via an epsilon edge, but the labeled edge
492-
* 2->3 'x' rules 3 out.
493-
* */
494-
if (state_set_contains(ainfo->reachable_from_nonstart_state, ns_i)) {
495-
continue;
496-
}
497-
498-
/* The reachable_from_nonstart_state check handles the other cases,
499-
* but for `$|^` other attempts to distinguish them will fail,
500-
* but by construction the USL will have the earlier state ID. */
501-
if (ainfo->unanchored_start_loop != NO_STATE &&
502-
ainfo->unanchored_start_loop < ns_i) {
503-
continue;
504-
}
505-
506-
ainfo->unanchored_start_loop = ns_i;
507-
continue;
508-
} else {
509-
/* Otherwise, a state without a dot self-edge is an anchored start.
510-
* There may be more than one. */
511-
if (LOG_ANALYZE_GROUP_NFA) {
512-
fprintf(stderr, "%s: anchored_start found on state %d\n", __func__, ns_i);
513-
}
514-
515-
if (!state_set_add(&ainfo->anchored_starts, nfa->alloc, ns_i)) {
516-
goto alloc_fail;
517-
}
518-
continue;
519-
}
520-
}
521-
522370
/* Copy labeled edges from the unanchored start loop and
523371
* its epsilon closure to ainfo->repeatable_firsts, except
524372
* for edges leading back to the unanchored start loop. */
@@ -576,35 +424,8 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo)
576424
* trivially match, but otherwise it would never match. */
577425
ainfo->nullable = start_state_epsilon_closure_matches_empty_string(nfa, eclosures[ainfo->start]);
578426

579-
/* If there's a state with a dot self-edge and an epsilon edge to an end state, it's
580-
* the unanchored end loop. There should only be one. */
581-
for (size_t s_i = 0; s_i < state_count; s_i++) {
582-
const struct fsm_state *s = &nfa->states[s_i];
583-
if (state_has_dot_self_edge(nfa, s_i)) {
584-
struct state_iter si;
585-
state_set_reset(s->epsilons, &si);
586-
fsm_state_t ns_i;
587-
while (state_set_next(&si, &ns_i)) {
588-
if (fsm_isend(nfa, ns_i)) {
589-
assert(ainfo->unanchored_end_loop == NO_STATE);
590-
ainfo->unanchored_end_loop = s_i;
591-
ainfo->unanchored_end_loop_end = ns_i;
592-
break;
593-
}
594-
}
595-
if (ainfo->unanchored_end_loop != NO_STATE) { break; }
596-
}
597-
}
598-
599427
/* Collect states that lead to an anchored end or eager match. */
600428
for (size_t s_i = 0; s_i < state_count; s_i++) {
601-
fsm_state_t dst_end = NO_STATE;
602-
if (state_has_epsilon_and_newline_edges_to_same_end(nfa, s_i, eclosures[s_i], &dst_end)) {
603-
if (!state_set_add(&ainfo->anchored_ends, nfa->alloc, dst_end)) {
604-
goto alloc_fail;
605-
}
606-
}
607-
608429
fsm_state_t indirect_dst = NO_STATE;
609430
if (state_has_labeled_edge_to_eclosure_with_unanchored_end_loop(nfa, s_i, eclosures, ainfo->unanchored_end_loop, &indirect_dst)) {
610431
if (!state_set_add(&ainfo->eager_matches, nfa->alloc, s_i)) {
@@ -619,57 +440,6 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo)
619440
}
620441
}
621442

622-
/* Compare/log the linkage info */
623-
#define COMPARE_LINKAGE_INFO 1
624-
#define LOG_LINKAGE_INFO 1
625-
if (LOG_LINKAGE_INFO) {
626-
struct state_iter si;
627-
state_set_reset(ainfo->anchored_starts, &si);
628-
fsm_state_t s_i;
629-
630-
fprintf(stderr, "ainfo->anchored_starts count: %zd\n", state_set_count(ainfo->anchored_starts));
631-
state_set_reset(ainfo->anchored_starts, &si);
632-
while (state_set_next(&si, &s_i)) {
633-
fprintf(stderr, "ainfo->anchored_starts: %d\n", s_i);
634-
}
635-
636-
fprintf(stderr, "linkage_info->anchored_starts count: %zd\n", state_set_count(nfa->linkage_info->anchored_starts));
637-
state_set_reset(nfa->linkage_info->anchored_starts, &si);
638-
while (state_set_next(&si, &s_i)) {
639-
fprintf(stderr, "linkage_info->anchored_starts: %d\n", s_i);
640-
}
641-
642-
assert(state_set_count(nfa->linkage_info->anchored_starts) >= state_set_count(ainfo->anchored_starts));
643-
state_set_reset(ainfo->anchored_starts, &si);
644-
while (state_set_next(&si, &s_i)) {
645-
assert(state_set_contains(nfa->linkage_info->anchored_starts, s_i));
646-
}
647-
}
648-
649-
if (LOG_LINKAGE_INFO) {
650-
struct state_iter si;
651-
state_set_reset(ainfo->anchored_ends, &si);
652-
fsm_state_t s_i;
653-
654-
fprintf(stderr, "ainfo->anchored_ends count: %zd\n", state_set_count(ainfo->anchored_ends));
655-
state_set_reset(ainfo->anchored_ends, &si);
656-
while (state_set_next(&si, &s_i)) {
657-
fprintf(stderr, "ainfo->anchored_ends: %d\n", s_i);
658-
}
659-
660-
fprintf(stderr, "linkage_info->anchored_ends count: %zd\n", state_set_count(nfa->linkage_info->anchored_ends));
661-
state_set_reset(nfa->linkage_info->anchored_ends, &si);
662-
while (state_set_next(&si, &s_i)) {
663-
fprintf(stderr, "linkage_info->anchored_ends: %d\n", s_i);
664-
}
665-
666-
assert(state_set_count(nfa->linkage_info->anchored_ends) >= state_set_count(ainfo->anchored_ends));
667-
state_set_reset(ainfo->anchored_ends, &si);
668-
while (state_set_next(&si, &s_i)) {
669-
assert(state_set_contains(nfa->linkage_info->anchored_ends, s_i));
670-
}
671-
}
672-
673443
#if LOG_ANALYZE_GROUP_NFA_RESULTS
674444
{
675445
fprintf(stderr, "# analysis_info start %d, usl %d, uel %d, uele %d\n",
@@ -682,27 +452,9 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo)
682452

683453
closure_free(nfa, eclosures, state_count);
684454

685-
if (COMPARE_LINKAGE_INFO) {
686-
/* Check that the analysis and saved linkage_info from ast_compile.c match */
687-
fprintf(stderr, "%s: checking that build-time data matches... usl %d, %d; uel %d, %d; uele %d, %d\n",
688-
__func__,
689-
nfa->linkage_info->unanchored_start_loop, ainfo->unanchored_start_loop,
690-
nfa->linkage_info->unanchored_end_loop, ainfo->unanchored_end_loop,
691-
nfa->linkage_info->unanchored_end_loop_end, ainfo->unanchored_end_loop_end);
692-
693-
if (nfa->linkage_info->unanchored_start_loop != ainfo->unanchored_start_loop) {
694-
fprintf(stderr, "DISAGREEMENT, overriding\n");
695-
ainfo->unanchored_start_loop = nfa->linkage_info->unanchored_start_loop;
696-
}
697-
698-
assert(nfa->linkage_info->unanchored_start_loop == ainfo->unanchored_start_loop);
699-
assert(nfa->linkage_info->unanchored_end_loop == ainfo->unanchored_end_loop);
700-
assert(nfa->linkage_info->unanchored_end_loop_end == ainfo->unanchored_end_loop_end);
701-
}
702-
703455
/* The unanchored start and end loop cannot be the same state. */
704-
assert(nfa->linkage_info->unanchored_start_loop == NO_STATE
705-
|| nfa->linkage_info->unanchored_start_loop != nfa->linkage_info->unanchored_end_loop);
456+
assert(ainfo->unanchored_start_loop == NO_STATE
457+
|| ainfo->unanchored_start_loop != ainfo->unanchored_end_loop);
706458

707459
return true;
708460

@@ -912,8 +664,6 @@ rebase_analysis_info(struct analysis_info *ainfo, fsm_state_t base)
912664
static void
913665
free_analysis(const struct fsm_alloc *alloc, struct analysis_info *ainfo)
914666
{
915-
state_set_free(ainfo->anchored_ends);
916-
state_set_free(ainfo->anchored_starts);
917667
state_set_free(ainfo->eager_matches);
918668
state_set_free(ainfo->needs_indirect_epsilon_edge_to_eager_match_state);
919669
state_set_free(ainfo->reachable_from_nonstart_state);
@@ -958,8 +708,26 @@ fsm_union_repeated_pattern_group(size_t nfa_count,
958708
for (size_t i = 0; i < nfa_count; i++) {
959709
struct fsm *fsm = nfas[i];
960710

711+
struct analysis_info *ainfo = &ainfos[i];
712+
713+
/* Copy these fields over, because fsm->linkage_info will be
714+
* freed during the call to fsm_merge below. */
715+
{
716+
struct linkage_info *linkage_info = fsm->linkage_info;
717+
718+
ainfo->unanchored_start_loop = linkage_info->unanchored_start_loop;
719+
ainfo->unanchored_end_loop = linkage_info->unanchored_end_loop;
720+
ainfo->unanchored_end_loop_end = linkage_info->unanchored_end_loop_end;
721+
722+
/* Transfer ownership of these. */
723+
ainfo->anchored_starts = linkage_info->anchored_starts;
724+
linkage_info->anchored_starts = NULL;
725+
ainfo->anchored_ends = linkage_info->anchored_ends;
726+
linkage_info->anchored_ends = NULL;
727+
}
728+
961729
/* Identify various states in the NFA that will be relevant to combining. */
962-
if (!analyze_group_nfa(fsm, &ainfos[i])) {
730+
if (!analyze_group_nfa(fsm, ainfo)) {
963731
goto fail;
964732
}
965733

0 commit comments

Comments
 (0)