diff --git a/CMakeLists.txt b/CMakeLists.txt index a9e36de8c..215013d01 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,6 +61,7 @@ check_modular("SPECIALMATRICES") check_modular("STRINGLIST") check_modular("STATS") check_modular("SYSTEM") +check_modular("REGEX") option(FIND_BLAS "Find external BLAS and LAPACK" ON) diff --git a/doc/specs/index.md b/doc/specs/index.md index f709fb2ca..fc3ddeb11 100644 --- a/doc/specs/index.md +++ b/doc/specs/index.md @@ -30,6 +30,7 @@ This is an index/directory of the specifications (specs) for each new module/fea - [optval](./stdlib_optval.html) - Fallback value for optional arguments - [quadrature](./stdlib_quadrature.html) - Numerical integration - [random](./stdlib_random.html) - Probability Distributions random number generator + - [regex](./stdlib_regex.html) - Regular expression pattern matching - [sorting](./stdlib_sorting.html) - Sorting of rank one arrays - [stats](./stdlib_stats.html) - Descriptive Statistics - [stats_distributions_uniform](./stdlib_stats_distribution_uniform.html) - Uniform Probability Distribution diff --git a/doc/specs/stdlib_regex.md b/doc/specs/stdlib_regex.md new file mode 100644 index 000000000..bb1fa75e8 --- /dev/null +++ b/doc/specs/stdlib_regex.md @@ -0,0 +1,138 @@ +--- +title: regex +--- + +# Regular Expressions + +[TOC] + +## Overview + +The `stdlib_regex` module provides a pure Fortran regular expression engine +based on Thompson's NFA (Nondeterministic Finite Automaton) construction. +It guarantees linear-time matching `O(n × m)` with no backtracking, +making it safe for use with arbitrary input without risk of catastrophic +performance degradation. + +### Supported Syntax + +| Pattern | Description | Example | +|-------------|--------------------------------------|------------------| +| `.` | Match any single character | `a.c` → `abc` | +| `*` | Zero or more of preceding element | `ab*c` → `ac` | +| `+` | One or more of preceding element | `ab+c` → `abbc` | +| `?` | Zero or one of preceding element | `colou?r` | +| `\|` | Alternation | `cat\|dog` | +| `(` `)` | Grouping | `(ab)+` | +| `[...]` | Character class | `[a-z]` | +| `[^...]` | Negated character class | `[^0-9]` | +| `^` | Start of string anchor | `^foo` | +| `$` | End of string anchor | `bar$` | +| `\d` | Digit `[0-9]` | `\d+` | +| `\w` | Word character `[a-zA-Z0-9_]` | `\w+` | +| `\s` | Whitespace (space, tab, newline, CR) | `\s+` | +| `\` | Escape next character | `\.` | + +## `regex_type` - Regular expression type + +### Status + +Experimental + +### Description + +A derived type representing a compiled regular expression. It stores the +internal NFA state graph produced by `regcomp` and is passed to `regmatch` +for pattern matching. + +### Syntax + +```fortran +type(regex_type) :: re +``` + +## `regcomp` - Compile a regular expression + +### Status + +Experimental + +### Description + +Compiles a regular expression pattern string into a `regex_type` object. +The compiled object can then be reused for multiple calls to `regmatch` +without recompilation. + +### Syntax + +```fortran +call [[stdlib_regex(module):regcomp(subroutine)]](re, pattern [, status]) +``` + +### Class + +Subroutine + +### Arguments + +`re`: Shall be of type `regex_type`. It is an `intent(out)` argument. +The compiled regular expression. + +`pattern`: Shall be of type `character(len=*)`. It is an `intent(in)` argument. +The regular expression pattern string to compile. + +`status` (optional): Shall be of type `integer`. It is an `intent(out)` argument. +Returns 0 on success, or a non-zero value if the pattern is invalid +(e.g., mismatched parentheses or brackets). + +### Example + +```fortran +{!example/regex/example_regex_regcomp.f90!} +``` + +## `regmatch` - Match a compiled regular expression + +### Status + +Experimental + +### Description + +Searches for the first occurrence of the compiled regular expression `re` +within the input `string`. If a match is found, `is_match` is set to `.true.` +and the optional `match_start` and `match_end` arguments are set to +the 1-based start and end positions of the matched substring. + +### Syntax + +```fortran +call [[stdlib_regex(module):regmatch(subroutine)]](re, string, is_match [, match_start, match_end]) +``` + +### Class + +Subroutine + +### Arguments + +`re`: Shall be of type `regex_type`. It is an `intent(in)` argument. +A compiled regular expression obtained from `regcomp`. + +`string`: Shall be of type `character(len=*)`. It is an `intent(in)` argument. +The input string to search for a match. + +`is_match`: Shall be of type `logical`. It is an `intent(out)` argument. +Set to `.true.` if a match is found, `.false.` otherwise. + +`match_start` (optional): Shall be of type `integer`. It is an `intent(out)` argument. +The 1-based index of the first character of the match. + +`match_end` (optional): Shall be of type `integer`. It is an `intent(out)` argument. +The 1-based index of the last character of the match. + +### Example + +```fortran +{!example/regex/example_regex_regmatch.f90!} +``` diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index c2ce46fcf..103514a2a 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -44,6 +44,9 @@ add_subdirectory(optval) if (STDLIB_QUADRATURE) add_subdirectory(quadrature) endif() +if (STDLIB_REGEX) + add_subdirectory(regex) +endif() add_subdirectory(selection) add_subdirectory(sorting) add_subdirectory(specialfunctions_gamma) diff --git a/example/regex/CMakeLists.txt b/example/regex/CMakeLists.txt new file mode 100644 index 000000000..1e33d32bc --- /dev/null +++ b/example/regex/CMakeLists.txt @@ -0,0 +1,2 @@ +ADD_EXAMPLE(regex_regcomp) +ADD_EXAMPLE(regex_regmatch) diff --git a/example/regex/example_regex_regcomp.f90 b/example/regex/example_regex_regcomp.f90 new file mode 100644 index 000000000..79b93f273 --- /dev/null +++ b/example/regex/example_regex_regcomp.f90 @@ -0,0 +1,10 @@ +program example_regex_regcomp + use stdlib_regex, only: regex_type, regcomp + implicit none + type(regex_type) :: re + integer :: stat + + call regcomp(re, "(cat|dog)s?", stat) + if (stat /= 0) error stop "Invalid regex pattern" + print *, "Pattern compiled successfully." +end program example_regex_regcomp diff --git a/example/regex/example_regex_regmatch.f90 b/example/regex/example_regex_regmatch.f90 new file mode 100644 index 000000000..972b6c302 --- /dev/null +++ b/example/regex/example_regex_regmatch.f90 @@ -0,0 +1,25 @@ +program example_regex_regmatch + use stdlib_regex, only: regex_type, regcomp, regmatch + implicit none + type(regex_type) :: re + logical :: found + integer :: stat, ms, me + + ! Find a sequence of digits + call regcomp(re, "[0-9]+", stat) + call regmatch(re, "foo123bar", found, ms, me) + print "(a,l1,a,i0,a,i0)", "found = ", found, ", ms = ", ms, ", me = ", me + + ! Anchored match + call regcomp(re, "^hello", stat) + call regmatch(re, "hello world", found) + print "(a,l1)", "found = ", found + call regmatch(re, "say hello", found) + print "(a,l1)", "found = ", found + + ! Alternation with optional suffix + call regcomp(re, "(cat|dog)s?", stat) + call regmatch(re, "I like cats", found, ms, me) + print "(a,l1,a,i0,a,i0)", "found = ", found, ", ms = ", ms, ", me = ", me + +end program example_regex_regmatch diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 114b22b42..5b29f6597 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -44,6 +44,7 @@ add_subdirectory(datetime) ADD_SUBDIR(system) ADD_SUBDIR(stats) +ADD_SUBDIR(regex) add_subdirectory(sparse) diff --git a/src/regex/CMakeLists.txt b/src/regex/CMakeLists.txt new file mode 100644 index 000000000..05a711d6e --- /dev/null +++ b/src/regex/CMakeLists.txt @@ -0,0 +1,12 @@ +set(regex_fppFiles + ) + +set(regex_cppFiles + ) + +set(regex_f90Files + stdlib_regex.f90 + ) + +configure_stdlib_target(${PROJECT_NAME}_regex regex_f90Files regex_fppFiles regex_cppFiles) +target_link_libraries(${PROJECT_NAME}_regex PUBLIC ${PROJECT_NAME}_core) diff --git a/src/regex/stdlib_regex.f90 b/src/regex/stdlib_regex.f90 new file mode 100644 index 000000000..2fe3bfd43 --- /dev/null +++ b/src/regex/stdlib_regex.f90 @@ -0,0 +1,657 @@ +module stdlib_regex + use stdlib_ascii, only: TAB, LF, CR + implicit none + private + + public :: regex_type + public :: regcomp + public :: regmatch + + ! Opcodes for NFA states + integer, parameter :: OP_CHAR = 1 + integer, parameter :: OP_ANY = 2 + integer, parameter :: OP_CLASS = 3 + integer, parameter :: OP_START = 4 + integer, parameter :: OP_END = 5 + integer, parameter :: OP_SPLIT = 6 + integer, parameter :: OP_MATCH = 7 + integer, parameter :: OP_JMP = 8 + + ! Tags for tokens + integer, parameter :: TOK_CHAR = 1 + integer, parameter :: TOK_ANY = 2 + integer, parameter :: TOK_CLASS = 3 + integer, parameter :: TOK_START = 4 + integer, parameter :: TOK_END = 5 + integer, parameter :: TOK_STAR = 6 + integer, parameter :: TOK_PLUS = 7 + integer, parameter :: TOK_QUEST = 8 + integer, parameter :: TOK_LPAREN = 9 + integer, parameter :: TOK_RPAREN = 10 + integer, parameter :: TOK_ALT = 11 + integer, parameter :: TOK_CONCAT = 12 + + type :: state_type + integer :: op + character(len=1) :: c + logical :: bmap(0:127) + logical :: invert + integer :: out1 + integer :: out2 + end type state_type + + type :: token_type + integer :: tag + character(len=1) :: c + logical :: bmap(0:127) + logical :: invert + end type token_type + + type :: regex_type + type(state_type), allocatable :: states(:) + integer :: start_state + integer :: n_states + end type regex_type + + type :: out_node + integer :: s + integer :: o + integer :: next + end type out_node + + type :: out_list_type + integer :: head + integer :: tail + end type out_list_type + + type :: frag_type + integer :: start + type(out_list_type) :: out_list + end type frag_type + + type :: thread + integer :: state + integer :: start_pos + end type thread + +contains + + elemental logical function is_term_ender(tag) + integer, intent(in) :: tag + is_term_ender = (tag == TOK_CHAR .or. tag == TOK_ANY .or. & + tag == TOK_CLASS .or. tag == TOK_STAR .or. & + tag == TOK_PLUS .or. tag == TOK_QUEST .or. & + tag == TOK_RPAREN .or. tag == TOK_END .or. & + tag == TOK_START) + end function is_term_ender + + elemental logical function is_term_starter(tag) + integer, intent(in) :: tag + is_term_starter = (tag == TOK_CHAR .or. tag == TOK_ANY .or. & + tag == TOK_CLASS .or. tag == TOK_LPAREN .or. & + tag == TOK_START .or. tag == TOK_END) + end function is_term_starter + + elemental integer function prec(tag) + integer, intent(in) :: tag + if (tag == TOK_STAR .or. tag == TOK_PLUS .or. tag == TOK_QUEST) then + prec = 3 + else if (tag == TOK_CONCAT) then + prec = 2 + else if (tag == TOK_ALT) then + prec = 1 + else + prec = 0 + end if + end function prec + + subroutine tokenize(pattern, tokens, num_tokens, stat) + character(len=*), intent(in) :: pattern + type(token_type), allocatable, intent(out) :: tokens(:) + integer, intent(out) :: num_tokens + integer, intent(out) :: stat + + type(token_type), allocatable :: tmp_tokens(:) + type(token_type) :: t + integer :: i, k, len_p + character(len=1) :: c, c1, c2 + + len_p = len(pattern) + allocate(tmp_tokens(len_p * 4 + 1)) + num_tokens = 0 + stat = 0 + i = 1 + + do while (i <= len_p) + c = pattern(i:i) + t%tag = TOK_CHAR + t%c = ' ' + t%bmap = .false. + t%invert = .false. + + if (c == '\') then + if (i < len_p) then + i = i + 1 + c = pattern(i:i) + end if + t%tag = TOK_CHAR + t%c = c + if (c == 'd') then + t%tag = TOK_CLASS + t%bmap(iachar('0'):iachar('9')) = .true. + else if (c == 's') then + t%tag = TOK_CLASS + t%bmap(iachar(' ')) = .true. + t%bmap(iachar(TAB)) = .true. + t%bmap(iachar(LF)) = .true. + t%bmap(iachar(CR)) = .true. + else if (c == 'w') then + t%tag = TOK_CLASS + t%bmap(iachar('a'):iachar('z')) = .true. + t%bmap(iachar('A'):iachar('Z')) = .true. + t%bmap(iachar('0'):iachar('9')) = .true. + t%bmap(iachar('_')) = .true. + end if + else if (c == '.') then + t%tag = TOK_ANY + else if (c == '*' .or. c == '+' .or. c == '?') then + if (num_tokens == 0) then + stat = 1 + else + ! Valid repeatable tags: CHAR, ANY, CLASS, RPAREN + if (tmp_tokens(num_tokens)%tag /= TOK_CHAR .and. & + tmp_tokens(num_tokens)%tag /= TOK_ANY .and. & + tmp_tokens(num_tokens)%tag /= TOK_CLASS .and. & + tmp_tokens(num_tokens)%tag /= TOK_RPAREN) then + stat = 1 + end if + end if + if (c == '*') t%tag = TOK_STAR + if (c == '+') t%tag = TOK_PLUS + if (c == '?') t%tag = TOK_QUEST + else if (c == '|') then + t%tag = TOK_ALT + else if (c == '(') then + t%tag = TOK_LPAREN + else if (c == ')') then + if (num_tokens > 0) then + if (tmp_tokens(num_tokens)%tag == TOK_LPAREN) stat = 1 + end if + t%tag = TOK_RPAREN + else if (c == '^') then + t%tag = TOK_START + else if (c == '$') then + t%tag = TOK_END + else if (c == '[') then + t%tag = TOK_CLASS + i = i + 1 + if (i <= len_p .and. pattern(i:i) == '^') then + t%invert = .true. + i = i + 1 + end if + do while (i <= len_p .and. pattern(i:i) /= ']') + if (pattern(i:i) == '\') then + i = i + 1 + if (i > len_p) exit + end if + c1 = pattern(i:i) + if (i + 2 <= len_p .and. pattern(i+1:i+1) == '-') then + if (pattern(i+2:i+2) /= ']') then + c2 = pattern(i+2:i+2) + t%bmap(max(0, iachar(c1)):min(127, iachar(c2))) = .true. + i = i + 3 + cycle + end if + end if + k = iachar(c1) + if (k >= 0 .and. k <= 127) t%bmap(k) = .true. + i = i + 1 + end do + if (i > len_p) stat = 1 ! missing ] + else + t%tag = TOK_CHAR + t%c = c + end if + + num_tokens = num_tokens + 1 + tmp_tokens(num_tokens) = t + i = i + 1 + end do + + allocate(tokens(num_tokens * 2 + 1)) + ! Inject concats + k = 0 + do i = 1, num_tokens + if (i > 1) then + if (is_term_ender(tmp_tokens(i-1)%tag) .and. is_term_starter(tmp_tokens(i)%tag)) then + k = k + 1 + tokens(k)%tag = TOK_CONCAT + tokens(k)%c = ' ' + tokens(k)%invert = .false. + tokens(k)%bmap = .false. + end if + end if + k = k + 1 + tokens(k) = tmp_tokens(i) + end do + num_tokens = k + + end subroutine tokenize + + subroutine parse_to_postfix(tokens, num_tokens, postfix, num_postfix, stat) + type(token_type), intent(in) :: tokens(:) + integer, intent(in) :: num_tokens + type(token_type), allocatable, intent(out) :: postfix(:) + integer, intent(out) :: num_postfix + integer, intent(out) :: stat + + type(token_type), allocatable :: stack(:) + integer :: top, i, tag + + allocate(postfix(num_tokens + 1)) + allocate(stack(num_tokens + 1)) + num_postfix = 0 + top = 0 + stat = 0 + + do i = 1, num_tokens + tag = tokens(i)%tag + if (tag == TOK_CHAR .or. tag == TOK_ANY .or. tag == TOK_CLASS .or. & + tag == TOK_START .or. tag == TOK_END) then + num_postfix = num_postfix + 1 + postfix(num_postfix) = tokens(i) + else if (tag == TOK_STAR .or. tag == TOK_PLUS .or. tag == TOK_QUEST) then + num_postfix = num_postfix + 1 + postfix(num_postfix) = tokens(i) + else if (tag == TOK_LPAREN) then + top = top + 1 + stack(top) = tokens(i) + else if (tag == TOK_RPAREN) then + do while (top > 0) + if (stack(top)%tag == TOK_LPAREN) exit + num_postfix = num_postfix + 1 + postfix(num_postfix) = stack(top) + top = top - 1 + end do + if (top == 0) then + stat = 1 ! mismatched parens + return + end if + top = top - 1 + else if (tag == TOK_CONCAT .or. tag == TOK_ALT) then + do while (top > 0) + if (stack(top)%tag == TOK_LPAREN) exit + if (prec(stack(top)%tag) < prec(tag)) exit + num_postfix = num_postfix + 1 + postfix(num_postfix) = stack(top) + top = top - 1 + end do + top = top + 1 + stack(top) = tokens(i) + end if + end do + + do while (top > 0) + if (stack(top)%tag == TOK_LPAREN) then + stat = 1 + return + end if + num_postfix = num_postfix + 1 + postfix(num_postfix) = stack(top) + top = top - 1 + end do + end subroutine parse_to_postfix + + subroutine new_out(s, o, pool, p_size, return_idx) + integer, intent(in) :: s, o + type(out_node), intent(inout) :: pool(:) + integer, intent(inout) :: p_size + integer, intent(out) :: return_idx + p_size = p_size + 1 + pool(p_size)%s = s + pool(p_size)%o = o + pool(p_size)%next = 0 + return_idx = p_size + end subroutine new_out + + subroutine merge_lists(l1, l2, res, pool) + type(out_list_type), intent(in) :: l1, l2 + type(out_list_type), intent(out) :: res + type(out_node), intent(inout) :: pool(:) + if (l1%head == 0) then + res = l2 + else if (l2%head == 0) then + res = l1 + else + pool(l1%tail)%next = l2%head + res%head = l1%head + res%tail = l2%tail + end if + end subroutine merge_lists + + subroutine do_patch(list, target, pool, states) + type(out_list_type), intent(in) :: list + integer, intent(in) :: target + type(out_node), intent(in) :: pool(:) + type(state_type), intent(inout) :: states(:) + integer :: curr + curr = list%head + do while (curr /= 0) + if (pool(curr)%o == 1) then + states(pool(curr)%s)%out1 = target + else + states(pool(curr)%s)%out2 = target + end if + curr = pool(curr)%next + end do + end subroutine do_patch + + subroutine build_nfa(postfix, num_postfix, states, n_states, start_state, stat) + type(token_type), intent(in) :: postfix(:) + integer, intent(in) :: num_postfix + type(state_type), allocatable, intent(out) :: states(:) + integer, intent(out) :: n_states + integer, intent(out) :: start_state + integer, intent(out) :: stat + + type(frag_type), allocatable :: stack(:) + integer :: top, i, tag, out_idx + type(frag_type) :: f1, f2 + type(out_list_type) :: t_list, empty_list + type(out_node), allocatable :: local_pool(:) + integer :: p_size + + empty_list%head = 0 + empty_list%tail = 0 + + allocate(states((num_postfix+1) * 2)) + allocate(stack((num_postfix+1) * 2)) + allocate(local_pool((num_postfix+1) * 4)) + p_size = 0 + n_states = 0 + top = 0 + stat = 0 + + ! Empty pattern matches immediately + if (num_postfix == 0) then + n_states = n_states + 1 + states(n_states)%op = OP_MATCH + states(n_states)%out1 = 0 + states(n_states)%out2 = 0 + start_state = 1 + return + end if + + do i = 1, num_postfix + tag = postfix(i)%tag + + select case(tag) + case (TOK_CHAR, TOK_ANY, TOK_CLASS, TOK_START, TOK_END) + n_states = n_states + 1 + if (tag == TOK_CHAR) states(n_states)%op = OP_CHAR + if (tag == TOK_ANY) states(n_states)%op = OP_ANY + if (tag == TOK_CLASS) states(n_states)%op = OP_CLASS + if (tag == TOK_START) states(n_states)%op = OP_START + if (tag == TOK_END) states(n_states)%op = OP_END + + states(n_states)%c = postfix(i)%c + states(n_states)%bmap = postfix(i)%bmap + states(n_states)%invert = postfix(i)%invert + states(n_states)%out1 = 0 + states(n_states)%out2 = 0 + + top = top + 1 + stack(top)%start = n_states + call new_out(n_states, 1, local_pool, p_size, out_idx) + stack(top)%out_list%head = out_idx + stack(top)%out_list%tail = out_idx + + case (TOK_CONCAT) + if (top < 2) then; stat = 1; return; end if + f2 = stack(top); top = top - 1 + f1 = stack(top) + + call do_patch(f1%out_list, f2%start, local_pool, states) + stack(top)%start = f1%start + stack(top)%out_list = f2%out_list + + case (TOK_ALT) + if (top < 2) then; stat = 1; return; end if + f2 = stack(top); top = top - 1 + f1 = stack(top) + + n_states = n_states + 1 + states(n_states)%op = OP_SPLIT + states(n_states)%out1 = f1%start + states(n_states)%out2 = f2%start + + stack(top)%start = n_states + call merge_lists(f1%out_list, f2%out_list, stack(top)%out_list, local_pool) + + case (TOK_QUEST) + if (top < 1) then; stat = 1; return; end if + f1 = stack(top) + + n_states = n_states + 1 + states(n_states)%op = OP_SPLIT + states(n_states)%out1 = f1%start + states(n_states)%out2 = 0 + + call new_out(n_states, 2, local_pool, p_size, out_idx) + t_list%head = out_idx + t_list%tail = out_idx + call merge_lists(t_list, f1%out_list, stack(top)%out_list, local_pool) + stack(top)%start = n_states + + case (TOK_STAR) + if (top < 1) then; stat = 1; return; end if + f1 = stack(top) + + n_states = n_states + 1 + states(n_states)%op = OP_SPLIT + states(n_states)%out1 = f1%start + states(n_states)%out2 = 0 + + call do_patch(f1%out_list, n_states, local_pool, states) + + call new_out(n_states, 2, local_pool, p_size, out_idx) + stack(top)%out_list%head = out_idx + stack(top)%out_list%tail = out_idx + stack(top)%start = n_states + + case (TOK_PLUS) + if (top < 1) then; stat = 1; return; end if + f1 = stack(top) + + n_states = n_states + 1 + states(n_states)%op = OP_SPLIT + states(n_states)%out1 = f1%start + states(n_states)%out2 = 0 + + call do_patch(f1%out_list, n_states, local_pool, states) + + call new_out(n_states, 2, local_pool, p_size, out_idx) + stack(top)%out_list%head = out_idx + stack(top)%out_list%tail = out_idx + stack(top)%start = f1%start + + end select + end do + + if (top /= 1) then; stat = 1; return; end if + f1 = stack(1) + + n_states = n_states + 1 + states(n_states)%op = OP_MATCH + states(n_states)%out1 = 0 + states(n_states)%out2 = 0 + + call do_patch(f1%out_list, n_states, local_pool, states) + start_state = f1%start + + end subroutine build_nfa + + subroutine regcomp(re, pattern, status) + type(regex_type), intent(out) :: re + character(len=*), intent(in) :: pattern + integer, intent(out), optional :: status + + type(token_type), allocatable :: tokens(:) + type(token_type), allocatable :: postfix(:) + integer :: n_tok, n_post, stat + + call tokenize(pattern, tokens, n_tok, stat) + if (stat /= 0) then + if (present(status)) status = stat + return + end if + + call parse_to_postfix(tokens, n_tok, postfix, n_post, stat) + if (stat /= 0) then + if (present(status)) status = stat + return + end if + + call build_nfa(postfix, n_post, re%states, re%n_states, re%start_state, stat) + if (present(status)) status = stat + end subroutine regcomp + + subroutine add_thread(state_idx, start_pos, step_index, states, str_len, list, count, visited) + integer, intent(in) :: state_idx, start_pos, step_index + type(state_type), intent(in) :: states(:) + integer, intent(in) :: str_len + type(thread), intent(inout) :: list(:) + integer, intent(inout) :: count + integer, intent(inout) :: visited(:) + + integer :: op, curr_state, top + integer, allocatable :: stack(:) + + if (state_idx == 0) return + + allocate(stack(max(1, size(states) * 2))) + top = 1 + stack(top) = state_idx + + do while (top > 0) + curr_state = stack(top) + top = top - 1 + + if (curr_state == 0) cycle + if (visited(curr_state) == step_index) cycle + visited(curr_state) = step_index + + op = states(curr_state)%op + if (op == OP_SPLIT) then + top = top + 1; stack(top) = states(curr_state)%out1 + top = top + 1; stack(top) = states(curr_state)%out2 + else if (op == OP_JMP) then + top = top + 1; stack(top) = states(curr_state)%out1 + else if (op == OP_START) then + if (step_index == 0) then + top = top + 1; stack(top) = states(curr_state)%out1 + end if + else if (op == OP_END) then + if (step_index == str_len) then + top = top + 1; stack(top) = states(curr_state)%out1 + end if + else + count = count + 1 + list(count)%state = curr_state + list(count)%start_pos = start_pos + end if + end do + end subroutine add_thread + + subroutine regmatch(re, string, is_match, match_start, match_end) + type(regex_type), intent(in) :: re + character(len=*), intent(in) :: string + logical, intent(out) :: is_match + integer, intent(out), optional :: match_start + integer, intent(out), optional :: match_end + + type(thread), allocatable :: clist(:), nlist(:) + integer :: c_cnt, n_cnt, i, j, step_index, str_len + integer :: c_code, op + integer :: b_start, b_end + logical :: match_char + type(thread) :: t + integer, allocatable :: visited(:) + + str_len = len(string) + allocate(clist(re%n_states * 2)) + allocate(nlist(re%n_states * 2)) + allocate(visited(re%n_states)) + + b_start = -1 + b_end = -1 + is_match = .false. + + ! Empty matches at the very beginning + visited = -1 + c_cnt = 0 + call add_thread(re%start_state, 1, 0, re%states, str_len, clist, c_cnt, visited) + + do j = 1, c_cnt + if (re%states(clist(j)%state)%op == OP_MATCH) then + b_start = 1 + b_end = 0 + end if + end do + + do i = 1, str_len + step_index = i + n_cnt = 0 + visited = -1 + + do j = 1, c_cnt + t = clist(j) + if (t%state == 0) cycle + op = re%states(t%state)%op + match_char = .false. + + if (op == OP_CHAR) then + if (re%states(t%state)%c == string(i:i)) match_char = .true. + else if (op == OP_ANY) then + match_char = .true. + else if (op == OP_CLASS) then + c_code = iachar(string(i:i)) + if (c_code >= 0 .and. c_code <= 127) then + if (re%states(t%state)%bmap(c_code)) match_char = .true. + end if + if (re%states(t%state)%invert) match_char = .not. match_char + end if + + if (match_char) then + call add_thread(re%states(t%state)%out1, t%start_pos, step_index, re%states, str_len, nlist, n_cnt, visited) + end if + end do + + ! Always see if a new match can start here if we don't have one yet + if (b_start == -1) then + call add_thread(re%start_state, i + 1, step_index, re%states, str_len, nlist, n_cnt, visited) + end if + + do j = 1, n_cnt + if (re%states(nlist(j)%state)%op == OP_MATCH) then + if (b_start == -1 .or. nlist(j)%start_pos < b_start) then + b_start = nlist(j)%start_pos + b_end = i + else if (nlist(j)%start_pos == b_start .and. i > b_end) then + b_end = i + end if + end if + end do + + clist = nlist + c_cnt = n_cnt + end do + + if (b_start /= -1) then + is_match = .true. + if (present(match_start)) match_start = b_start + if (present(match_end)) match_end = b_end + end if + end subroutine regmatch + +end module stdlib_regex diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d90af3fa1..b1519871d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -65,6 +65,9 @@ add_subdirectory(math) if (STDLIB_STRINGLIST) add_subdirectory(stringlist) endif() +if (STDLIB_REGEX) + add_subdirectory(regex) +endif() if (STDLIB_ANSI) add_subdirectory(terminal) endif() diff --git a/test/regex/CMakeLists.txt b/test/regex/CMakeLists.txt new file mode 100644 index 000000000..42bbe6b04 --- /dev/null +++ b/test/regex/CMakeLists.txt @@ -0,0 +1 @@ +ADDTEST(regex) diff --git a/test/regex/catalogue_regex.f90 b/test/regex/catalogue_regex.f90 new file mode 100644 index 000000000..f2373e908 --- /dev/null +++ b/test/regex/catalogue_regex.f90 @@ -0,0 +1,178 @@ +program catalogue_regex + use stdlib_regex + + implicit none + + type(regex_type) :: re + character(len=100) :: line + character(len=20) :: keyword + character(len=:), allocatable :: value + character(len=:), allocatable :: expression + character(len=:), allocatable :: string + character(len=:), allocatable :: expected + + integer :: match_start, match_end, status, ierr, un, un20 + integer :: mismatches + logical :: matched + + open( newunit=un, file = 'catalogue_regex.inp', status = 'old', iostat = ierr ) + if ( ierr /= 0 ) then + write( *, '(a)' ) 'Could not open the file "catalogue_regex.inp"' + write( *, '(a)' ) 'It should exist - please check' + error stop + endif + + open( newunit=un20, file = 'catalogue_regex.report' ) + + mismatches = 0 + + do + read( un, '(a)', iostat = ierr ) line + + if ( ierr /= 0 ) then + exit + endif + + call extract_information( line, keyword, value ) + + select case( keyword ) + case( 'expression' ) + expression = value + + case( 'input' ) + string = value + + case( 'expected' ) + write( un20, '(a)' ) '' + + expected = value + + call regcomp( re, expression, status ) + + if ( status /= 0 ) then + mismatches = mismatches + 1 + write( un20, '(a,i0)' ) 'Error compiling the expression: status = ', status + write( un20, '(a,2a)' ) ' Expression: "', expression, '"' + else + call regmatch( re, string, matched, match_start, match_end ) + + if ( matched ) then + write( un20, '(a,2a)' ) 'Match found:' + write( un20, '(a,2a)' ) ' Expression: "', expression, '"' + write( un20, '(a,2a)' ) ' Input string: "', string, '"' + write( un20, '(a,2a)' ) ' Substring: "', string(match_start:match_end), '"' + write( un20, '(a,2a)' ) ' Expected: "', expected, '"' + if ( expected == string(match_start:match_end) ) then + write( un20, '(a,2a)' ) ' Success!' + else + mismatches = mismatches + 1 + write( un20, '(a,2a)' ) ' MISMATCH!' + endif + else + mismatches = mismatches + 1 + write( un20, '(a,2a)' ) 'NO match found:' + write( un20, '(a,2a)' ) ' Expression: "', expression, '"' + write( un20, '(a,2a)' ) ' Input string: "', string, '"' + write( un20, '(a,2a)' ) ' Substring: (none)' + write( un20, '(a,2a)' ) ' Expected: "', expected, '"' + endif + endif + + case( 'error-exp' ) + write( un20, '(a)' ) '' + call regcomp( re, expression, status ) + + if ( status /= 0 ) then + write( un20, '(a)' ) 'Error detected as expected:' + write( un20, '(a,2a)' ) ' Expression: "', expression, '"' + else + mismatches = mismatches + 1 + write( un20, '(a)' ) 'An error was expected but not detected:' + write( un20, '(a,2a)' ) ' Expression: "', expression, '"' + endif + + case( 'no-match' ) + write( un20, '(a)' ) '' + call regcomp( re, expression, status ) + + if ( status /= 0 ) then + mismatches = mismatches + 1 + write( un20, '(a,i0)' ) 'Error compiling the expression: status = ', status + write( un20, '(a,2a)' ) ' Expression: "', expression, '"' + else + call regmatch( re, string, matched, match_start, match_end ) + + if ( matched ) then + mismatches = mismatches + 1 + write( un20, '(a,2a)' ) 'Match found where none expected:' + write( un20, '(a,2a)' ) ' Expression: "', expression, '"' + write( un20, '(a,2a)' ) ' Input string: "', string, '"' + write( un20, '(a,2a)' ) ' Substring: "', string(match_start:match_end), '"' + write( un20, '(a,2a)' ) ' Expected: (none)' + else + write( un20, '(a,2a)' ) 'No match found, as expected:' + write( un20, '(a,2a)' ) ' Expression: "', expression, '"' + write( un20, '(a,2a)' ) ' Input string: "', string, '"' + write( un20, '(a,2a)' ) ' Expected: (none)' + endif + endif + + case default + ! Treat any other keyword as comment + + end select + enddo + + write( un20, '(/,a,i0)' ) 'Number of mismatches or other errors: ', mismatches + write( *, '(a)' ) 'Program completed' + +contains + +subroutine extract_information( line, keyword, value ) + character(len=*), intent(in) :: line + character(len=*), intent(out) :: keyword + character(len=:), intent(out), allocatable :: value + + character(len=20), dimension(5) :: known_keywords = & + [ 'expression ', & + 'input ', & + 'expected ', & + 'error-exp ', & + 'no-match ' ] + integer :: k1, k2 + + if ( line == " " ) then + keyword = "" + value = "" + return + endif + + read( line, *, iostat = ierr ) keyword + + if ( keyword == 'error-exp' .or. keyword == 'no-match' ) then + value = "" + return + endif + + if ( any( keyword == known_keywords ) ) then + allocate( value, mold = line ) + + k1 = index( line, '"' ) + if ( k1 > 0 ) then + k2 = k1 + index( line(k1+1:), '"' ) + if ( k2 > 0 ) then + value = line(k1+1:k2-1) + else + write( un20, '(a)' ) 'Error interpreting the input line:' + write( un20, '(2a)' ) ' "', trim(line), '"' + write( un20, '(2a)' ) 'Program stopped' + write( *, '(2a)' ) 'Program stopped - error reading input. Please check' + error stop + endif + endif + else + value = "" + endif +end subroutine extract_information + +end program catalogue_regex diff --git a/test/regex/test_regex.f90 b/test/regex/test_regex.f90 new file mode 100644 index 000000000..8ea53bbb3 --- /dev/null +++ b/test/regex/test_regex.f90 @@ -0,0 +1,247 @@ +module test_regex_mod + use testdrive, only : new_unittest, unittest_type, error_type, check + use stdlib_regex, only : regex_type, regcomp, regmatch + implicit none + private + + public :: collect_regex + +contains + + !> Collect all exported unit tests + subroutine collect_regex(testsuite) + !> Collection of tests + type(unittest_type), allocatable, intent(out) :: testsuite(:) + + testsuite = [ & + new_unittest("literal_match", test_literal_match), & + new_unittest("literal_no_match", test_literal_no_match), & + new_unittest("star_operator", test_star_operator), & + new_unittest("plus_char_class", test_plus_char_class), & + new_unittest("alternation_grouping", test_alternation_grouping), & + new_unittest("anchor_start_fail", test_anchor_start_fail), & + new_unittest("anchor_start_pass", test_anchor_start_pass), & + new_unittest("dot_any", test_dot_any), & + new_unittest("question_mark", test_question_mark), & + new_unittest("empty_pattern", test_empty_pattern) & + ] + end subroutine collect_regex + + subroutine test_literal_match(error) + !> Error handling + type(error_type), allocatable, intent(out) :: error + type(regex_type) :: re + integer :: stat, ms, me + logical :: found + + call regcomp(re, "abc", stat) + call check(error, stat == 0, "regcomp failed for 'abc'") + if (allocated(error)) return + + call regmatch(re, "xyz_abc_def", found, ms, me) + call check(error, found, "Should find 'abc' in 'xyz_abc_def'") + if (allocated(error)) return + + call check(error, ms == 5, "match_start should be 5") + if (allocated(error)) return + + call check(error, me == 7, "match_end should be 7") + if (allocated(error)) return + end subroutine test_literal_match + + subroutine test_literal_no_match(error) + !> Error handling + type(error_type), allocatable, intent(out) :: error + type(regex_type) :: re + integer :: stat + logical :: found + + call regcomp(re, "xyz", stat) + call check(error, stat == 0, "regcomp failed for 'xyz'") + if (allocated(error)) return + + call regmatch(re, "abcdef", found) + call check(error, .not. found, "Should not find 'xyz' in 'abcdef'") + if (allocated(error)) return + end subroutine test_literal_no_match + + subroutine test_star_operator(error) + !> Error handling + type(error_type), allocatable, intent(out) :: error + type(regex_type) :: re + integer :: stat + logical :: found + + call regcomp(re, "a*b", stat) + call check(error, stat == 0, "regcomp failed for 'a*b'") + if (allocated(error)) return + + call regmatch(re, "aaaab", found) + call check(error, found, "Should match 'aaaab' with 'a*b'") + if (allocated(error)) return + + call regmatch(re, "b", found) + call check(error, found, "Should match 'b' with 'a*b' (zero a's)") + if (allocated(error)) return + end subroutine test_star_operator + + subroutine test_plus_char_class(error) + !> Error handling + type(error_type), allocatable, intent(out) :: error + type(regex_type) :: re + integer :: stat, ms, me + logical :: found + + call regcomp(re, "[0-9]+", stat) + call check(error, stat == 0, "regcomp failed for '[0-9]+'") + if (allocated(error)) return + + call regmatch(re, "foo123bar", found, ms, me) + call check(error, found, "Should find digits in 'foo123bar'") + if (allocated(error)) return + + call regmatch(re, "no_digits_here", found) + call check(error, .not. found, "Should not find digits in 'no_digits_here'") + if (allocated(error)) return + end subroutine test_plus_char_class + + subroutine test_alternation_grouping(error) + !> Error handling + type(error_type), allocatable, intent(out) :: error + type(regex_type) :: re + integer :: stat + logical :: found + + call regcomp(re, "(dog|cat)s?", stat) + call check(error, stat == 0, "regcomp failed for '(dog|cat)s?'") + if (allocated(error)) return + + call regmatch(re, "I have cats and dogs.", found) + call check(error, found, "Should find 'cats' or 'dogs' in sentence") + if (allocated(error)) return + + call regmatch(re, "I have birds.", found) + call check(error, .not. found, "Should not find 'cat' or 'dog' in 'I have birds.'") + if (allocated(error)) return + end subroutine test_alternation_grouping + + subroutine test_anchor_start_fail(error) + !> Error handling + type(error_type), allocatable, intent(out) :: error + type(regex_type) :: re + integer :: stat + logical :: found + + call regcomp(re, "^foo", stat) + call check(error, stat == 0, "regcomp failed for '^foo'") + if (allocated(error)) return + + call regmatch(re, "bar foo", found) + call check(error, .not. found, "'^foo' should not match 'bar foo'") + if (allocated(error)) return + end subroutine test_anchor_start_fail + + subroutine test_anchor_start_pass(error) + !> Error handling + type(error_type), allocatable, intent(out) :: error + type(regex_type) :: re + integer :: stat + logical :: found + + call regcomp(re, "^foo", stat) + call check(error, stat == 0, "regcomp failed for '^foo'") + if (allocated(error)) return + + call regmatch(re, "foo bar", found) + call check(error, found, "'^foo' should match 'foo bar'") + if (allocated(error)) return + end subroutine test_anchor_start_pass + + subroutine test_dot_any(error) + !> Error handling + type(error_type), allocatable, intent(out) :: error + type(regex_type) :: re + integer :: stat + logical :: found + + call regcomp(re, "a.c", stat) + call check(error, stat == 0, "regcomp failed for 'a.c'") + if (allocated(error)) return + + call regmatch(re, "abc", found) + call check(error, found, "'a.c' should match 'abc'") + if (allocated(error)) return + + call regmatch(re, "aXc", found) + call check(error, found, "'a.c' should match 'aXc'") + if (allocated(error)) return + + call regmatch(re, "ac", found) + call check(error, .not. found, "'a.c' should not match 'ac'") + if (allocated(error)) return + end subroutine test_dot_any + + subroutine test_question_mark(error) + !> Error handling + type(error_type), allocatable, intent(out) :: error + type(regex_type) :: re + integer :: stat + logical :: found + + call regcomp(re, "colou?r", stat) + call check(error, stat == 0, "regcomp failed for 'colou?r'") + if (allocated(error)) return + + call regmatch(re, "color", found) + call check(error, found, "'colou?r' should match 'color'") + if (allocated(error)) return + + call regmatch(re, "colour", found) + call check(error, found, "'colou?r' should match 'colour'") + if (allocated(error)) return + end subroutine test_question_mark + + subroutine test_empty_pattern(error) + !> Error handling + type(error_type), allocatable, intent(out) :: error + type(regex_type) :: re + integer :: stat + logical :: found + + call regcomp(re, "", stat) + call check(error, stat == 0, "regcomp should succeed for empty pattern") + if (allocated(error)) return + + call regmatch(re, "anything", found) + call check(error, found, "Empty pattern should match any string") + if (allocated(error)) return + end subroutine test_empty_pattern + +end module test_regex_mod + + +program tester + use, intrinsic :: iso_fortran_env, only : error_unit + use testdrive, only : run_testsuite, new_testsuite, testsuite_type + use test_regex_mod, only : collect_regex + implicit none + integer :: stat, is + type(testsuite_type), allocatable :: testsuites(:) + character(len=*), parameter :: fmt = '("#", *(1x, a))' + + stat = 0 + + testsuites = [ & + new_testsuite("regex", collect_regex) & + ] + + do is = 1, size(testsuites) + write(error_unit, fmt) "Testing:", testsuites(is)%name + call run_testsuite(testsuites(is)%collect, error_unit, stat) + end do + + if (stat > 0) then + write(error_unit, '(i0, 1x, a)') stat, "test(s) failed!" + error stop + end if +end program