@@ -130,36 +130,57 @@ abstract class RegexString extends Expr {
130
130
/** result is true for those start chars that actually mark a start of a char set. */
131
131
boolean char_set_start ( int pos ) {
132
132
exists ( int index |
133
- char_set_delimiter ( index , pos ) = true and
133
+ // is opening bracket
134
+ this .char_set_delimiter ( index , pos ) = true and
134
135
(
135
- index = 1 and result = true // if a '[' is first in the string (among brackets), it starts a char set
136
+ // if this is the first bracket, `pos` starts a char set
137
+ index = 1 and result = true
136
138
or
139
+ // if the previous char set delimiter was not a closing bracket, `pos` does
140
+ // not start a char set. This is needed to handle cases such as `[[]` (a
141
+ // char set that matches the `[` char)
137
142
index > 1 and
138
- not char_set_delimiter ( index - 1 , _) = false and
143
+ not this . char_set_delimiter ( index - 1 , _) = false and
139
144
result = false
140
145
or
141
- exists ( int p1 |
142
- char_set_delimiter ( index - 1 , p1 ) = false and // if it is preceded by a closing bracket, it starts a char set
146
+ // special handling of cases such as `[][]` (the character-set of the characters `]` and `[`).
147
+ exists ( int prev_closing_bracket_pos |
148
+ // previous bracket is a closing bracket
149
+ this .char_set_delimiter ( index - 1 , prev_closing_bracket_pos ) = false and
143
150
if
144
- exists ( int p2 |
145
- p1 = p2 + 1
146
- or
147
- this .getChar ( p2 + 1 ) = "^" and
148
- p1 = p2 + 2
151
+ // check if the character that comes before the previous closing bracket
152
+ // is an opening bracket (taking `^` into account)
153
+ exists ( int pos_before_prev_closing_bracket |
154
+ if this .getChar ( prev_closing_bracket_pos - 1 ) = "^"
155
+ then pos_before_prev_closing_bracket = prev_closing_bracket_pos - 2
156
+ else pos_before_prev_closing_bracket = prev_closing_bracket_pos - 1
149
157
|
150
- char_set_delimiter ( index - 2 , p2 ) = true // but the closing bracket only closes...
158
+ this . char_set_delimiter ( index - 2 , pos_before_prev_closing_bracket ) = true
151
159
)
152
160
then
153
- exists ( int p2 | char_set_delimiter ( index - 2 , p2 ) = true |
154
- result = char_set_start ( p2 ) .booleanNot ( ) // ...if it is not the first in a char set
161
+ // brackets without anything in between is not valid character ranges, so
162
+ // the first closing bracket in `[]]` and `[^]]` does not count,
163
+ //
164
+ // and we should _not_ mark the second opening bracket in `[][]` and `[^][]`
165
+ // as starting a new char set. ^ ^
166
+ exists ( int pos_before_prev_closing_bracket |
167
+ this .char_set_delimiter ( index - 2 , pos_before_prev_closing_bracket ) = true
168
+ |
169
+ result = this .char_set_start ( pos_before_prev_closing_bracket ) .booleanNot ( )
155
170
)
156
- else result = true
171
+ else
172
+ // if not, `pos` does in fact mark a real start of a character range
173
+ result = true
157
174
)
158
175
)
159
176
)
160
177
}
161
178
162
- /** result denotes if the index is a left bracket */
179
+ /**
180
+ * Helper predicate for chars that could be character-set delimiters.
181
+ * Holds if the (non-escaped) char at `pos` in the string, is the (one-based) `index` occurrence of a bracket (`[` or `]`) in the string.
182
+ * Result if `true` is the char is `[`, and `false` if the char is `]`.
183
+ */
163
184
boolean char_set_delimiter ( int index , int pos ) {
164
185
pos = rank [ index ] ( int p | this .nonEscapedCharAt ( p ) = "[" or this .nonEscapedCharAt ( p ) = "]" ) and
165
186
(
0 commit comments