Skip to content

Commit 92db350

Browse files
committed
⚡️✅ Update regexps to complete in linear time
`BEG_REGEXP` has been significantly changed so that it runs in linear time under Ruby 3.2. All lookahead has been eliminated. A correct regexp for `ATOM` is implemented but unused. `ATOMISH` describes the current behavior, which excludes "[" from the atom chars. The `msg-att` field labels require the `ATOMISH` definition, for now. A regexp for `TAG` is implemented but is also unused for now.
1 parent 6b3b21e commit 92db350

File tree

1 file changed

+84
-33
lines changed

1 file changed

+84
-33
lines changed

lib/net/imap/response_parser.rb

Lines changed: 84 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -130,11 +130,39 @@ module RFC3629
130130
include RFC5234
131131
include RFC3629
132132

133+
# CHAR8 = %x01-ff
134+
# ; any OCTET except NUL, %x00
135+
CHAR8 = /[\x01-\xff]/n
136+
137+
# list-wildcards = "%" / "*"
138+
LIST_WILDCARDS = /[%*]/n
133139
# quoted-specials = DQUOTE / "\"
134140
QUOTED_SPECIALS = /["\\]/n
135141
# resp-specials = "]"
136142
RESP_SPECIALS = /[\]]/n
137143

144+
# atomish = 1*<any ATOM-CHAR except "[">
145+
# ; We use "atomish" for msg-att and section, in order
146+
# ; to simplify "BODY[HEADER.FIELDS (foo bar)]".
147+
#
148+
# atom-specials = "(" / ")" / "{" / SP / CTL / list-wildcards /
149+
# quoted-specials / resp-specials
150+
# ATOM-CHAR = <any CHAR except atom-specials>
151+
# atom = 1*ATOM-CHAR
152+
# ASTRING-CHAR = ATOM-CHAR / resp-specials
153+
# tag = 1*<any ASTRING-CHAR except "+">
154+
155+
ATOM_SPECIALS = /[(){ \x00-\x1f\x7f%*"\\\]]/n
156+
ASTRING_SPECIALS = /[(){ \x00-\x1f\x7f%*"\\]/n
157+
158+
ASTRING_CHAR = CHAR - ASTRING_SPECIALS
159+
ATOM_CHAR = CHAR - ATOM_SPECIALS
160+
161+
ATOM = /#{ATOM_CHAR}+/n
162+
ASTRING_CHARS = /#{ASTRING_CHAR}+/n
163+
ATOMISH = /#{ATOM_CHAR - /[\[]/ }+/
164+
TAG = /#{ASTRING_CHAR - /[+]/ }+/
165+
138166
# TEXT-CHAR = <any CHAR except CR and LF>
139167
TEXT_CHAR = CHAR - /[\r\n]/
140168

@@ -167,6 +195,19 @@ module RFC3629
167195
TEXT_rev1 = /#{TEXT_CHAR}+/
168196
TEXT_rev2 = /#{Regexp.union TEXT_CHAR, UTF8_2, UTF8_3, UTF8_4}+/
169197

198+
# RFC3501:
199+
# literal = "{" number "}" CRLF *CHAR8
200+
# ; Number represents the number of CHAR8s
201+
# RFC9051:
202+
# literal = "{" number64 ["+"] "}" CRLF *CHAR8
203+
# ; <number64> represents the number of CHAR8s.
204+
# ; A non-synchronizing literal is distinguished
205+
# ; from a synchronizing literal by the presence of
206+
# ; "+" before the closing "}".
207+
# ; Non-synchronizing literals are not allowed when
208+
# ; sent from server to the client.
209+
LITERAL = /\{(\d+)\}\r\n/n
210+
170211
module_function
171212

172213
def unescape_quoted!(quoted)
@@ -185,30 +226,36 @@ def unescape_quoted(quoted)
185226

186227
# the default, used in most places
187228
BEG_REGEXP = /\G(?:\
188-
(?# 1: SPACE )( +)|\
189-
(?# 2: NIL )(NIL)(?=[\x80-\xff(){ \x00-\x1f\x7f%*"\\\[\]+])|\
190-
(?# 3: NUMBER )(\d+)(?=[\x80-\xff(){ \x00-\x1f\x7f%*"\\\[\]+])|\
191-
(?# 4: ATOM )([^\x80-\xff(){ \x00-\x1f\x7f%*"\\\[\]+]+)|\
192-
(?# 5: QUOTED )#{Patterns::QUOTED_rev2}|\
193-
(?# 6: LPAR )(\()|\
194-
(?# 7: RPAR )(\))|\
195-
(?# 8: BSLASH )(\\)|\
196-
(?# 9: STAR )(\*)|\
197-
(?# 10: LBRA )(\[)|\
198-
(?# 11: RBRA )(\])|\
199-
(?# 12: LITERAL )\{(\d+)\}\r\n|\
200-
(?# 13: PLUS )(\+)|\
201-
(?# 14: PERCENT )(%)|\
202-
(?# 15: CRLF )(\r\n)|\
203-
(?# 16: EOF )(\z))/ni
229+
(?# 1: SPACE )( )|\
230+
(?# 2: ATOM prefixed with a compatible subtype)\
231+
((?:\
232+
(?# 3: NIL )(NIL)|\
233+
(?# 4: NUMBER )(\d+)|\
234+
(?# 5: PLUS )(\+))\
235+
(?# 6: ATOM remaining after prefix )(#{Patterns::ATOMISH})?\
236+
(?# This enables greedy alternation without lookahead, in linear time.)\
237+
)|\
238+
(?# Also need to check for ATOM without a subtype prefix.)\
239+
(?# 7: ATOM )(#{Patterns::ATOMISH})|\
240+
(?# 8: QUOTED )#{Patterns::QUOTED_rev2}|\
241+
(?# 9: LPAR )(\()|\
242+
(?# 10: RPAR )(\))|\
243+
(?# 11: BSLASH )(\\)|\
244+
(?# 12: STAR )(\*)|\
245+
(?# 13: LBRA )(\[)|\
246+
(?# 14: RBRA )(\])|\
247+
(?# 15: LITERAL )#{Patterns::LITERAL}|\
248+
(?# 16: PERCENT )(%)|\
249+
(?# 17: CRLF )(\r\n)|\
250+
(?# 18: EOF )(\z))/ni
204251

205252
# envelope, body(structure), namespaces
206253
DATA_REGEXP = /\G(?:\
207254
(?# 1: SPACE )( )|\
208255
(?# 2: NIL )(NIL)|\
209256
(?# 3: NUMBER )(\d+)|\
210257
(?# 4: QUOTED )#{Patterns::QUOTED_rev2}|\
211-
(?# 5: LITERAL )\{(\d+)\}\r\n|\
258+
(?# 5: LITERAL )#{Patterns::LITERAL}|\
212259
(?# 6: LPAR )(\()|\
213260
(?# 7: RPAR )(\)))/ni
214261

@@ -1501,38 +1548,42 @@ def next_token
15011548
@pos = $~.end(0)
15021549
if $1
15031550
return Token.new(T_SPACE, $+)
1504-
elsif $2
1505-
return Token.new(T_NIL, $+)
1551+
elsif $2 && $6
1552+
# Greedily match an ATOM that starts with a NIL, NUMBER, or PLUS prefix: $2 is the whole atom (the prefix plus the remainder captured in $6).
1553+
return Token.new(T_ATOM, $2)
15061554
elsif $3
1507-
return Token.new(T_NUMBER, $+)
1555+
return Token.new(T_NIL, $+)
15081556
elsif $4
1509-
return Token.new(T_ATOM, $+)
1557+
return Token.new(T_NUMBER, $+)
15101558
elsif $5
1559+
return Token.new(T_PLUS, $+)
1560+
elsif $7
1561+
# match ATOM, without a NUMBER, NIL, or PLUS prefix
1562+
return Token.new(T_ATOM, $+)
1563+
elsif $8
15111564
return Token.new(T_QUOTED, Patterns.unescape_quoted($+))
1512-
elsif $6
1565+
elsif $9
15131566
return Token.new(T_LPAR, $+)
1514-
elsif $7
1567+
elsif $10
15151568
return Token.new(T_RPAR, $+)
1516-
elsif $8
1569+
elsif $11
15171570
return Token.new(T_BSLASH, $+)
1518-
elsif $9
1571+
elsif $12
15191572
return Token.new(T_STAR, $+)
1520-
elsif $10
1573+
elsif $13
15211574
return Token.new(T_LBRA, $+)
1522-
elsif $11
1575+
elsif $14
15231576
return Token.new(T_RBRA, $+)
1524-
elsif $12
1577+
elsif $15
15251578
len = $+.to_i
15261579
val = @str[@pos, len]
15271580
@pos += len
15281581
return Token.new(T_LITERAL, val)
1529-
elsif $13
1530-
return Token.new(T_PLUS, $+)
1531-
elsif $14
1582+
elsif $16
15321583
return Token.new(T_PERCENT, $+)
1533-
elsif $15
1584+
elsif $17
15341585
return Token.new(T_CRLF, $+)
1535-
elsif $16
1586+
elsif $18
15361587
return Token.new(T_EOF, $+)
15371588
else
15381589
parse_error("[Net::IMAP BUG] BEG_REGEXP is invalid")

0 commit comments

Comments
 (0)