Skip to content

Commit bf71d85

Browse files
committed
✨ Add UTF-8 support for both quoted and text
The parser update supports both RFC6855 (UTF8=ACCEPT, UTF8=ONLY) and the UTF8 requirements of IMAP4rev2 (resp-text). Also updated #enable documentation and method signature: * document `UTF8=ACCEPT` as "supported" * use `*rest` args => flatten => map(aliases) => uniq * add `:utf8` as an alias for `UTF8=ACCEPT`
1 parent dde965b commit bf71d85

File tree

8 files changed

+231
-51
lines changed

8 files changed

+231
-51
lines changed

benchmarks/generate_parser_benchmarks

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ init = <<RUBY
2828
require "net/imap"
2929
3030
def load_response(file, name)
31-
YAML.unsafe_load_file(file).dig(:tests, name, :response) \\
31+
YAML.unsafe_load_file(file).dig(:tests, name, :response)
32+
.force_encoding "ASCII-8BIT" \\
3233
or abort "ERRORO: missing %p fixture data in %p" % [name, file]
3334
end
3435

benchmarks/parser.yml

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@ prelude: |2
44
require "net/imap"
55
66
def load_response(file, name)
7-
YAML.unsafe_load_file(file).dig(:tests, name, :response) \
7+
YAML.unsafe_load_file(file).dig(:tests, name, :response)
8+
.force_encoding "ASCII-8BIT" \
89
or abort "ERRORO: missing %p fixture data in %p" % [name, file]
910
end
1011
@@ -560,6 +561,16 @@ benchmark:
560561
response = load_response("../test/net/imap/fixtures/response_parser/thread_responses.yml",
561562
"thread_rfc5256_example5")
562563
script: parser.parse(response)
564+
- name: utf8_in_list_mailbox
565+
prelude: |2
566+
response = load_response("../test/net/imap/fixtures/response_parser/utf8_responses.yml",
567+
"test_utf8_in_list_mailbox")
568+
script: parser.parse(response)
569+
- name: utf8_in_resp_text
570+
prelude: |2
571+
response = load_response("../test/net/imap/fixtures/response_parser/utf8_responses.yml",
572+
"test_utf8_in_resp_text")
573+
script: parser.parse(response)
563574
- name: xlist_inbox
564575
prelude: |2
565576
response = load_response("../test/net/imap/fixtures/response_parser/list_responses.yml",

lib/net/imap.rb

Lines changed: 85 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -489,12 +489,9 @@ module Net
489489
# - #move, #uid_move: Moves the specified messages to the end of the
490490
# specified destination mailbox, expunging them from the current mailbox.
491491
#
492-
#--
493-
# ==== RFC6855: UTF8=ACCEPT
494-
# TODO...
495-
# ==== RFC6855: UTF8=ONLY
496-
# TODO...
497-
#++
492+
# ==== RFC6855: <tt>UTF8=ACCEPT</tt>, <tt>UTF8=ONLY</tt>
493+
#
494+
# - See #enable for information about support for UTF-8 string encoding.
498495
#
499496
#--
500497
# ==== RFC7888: <tt>LITERAL+</tt>, +LITERAL-+
@@ -679,6 +676,11 @@ module Net
679676
# Gulbrandsen, A. and N. Freed, Ed., "Internet Message Access Protocol
680677
# (\IMAP) - MOVE Extension", RFC 6851, DOI 10.17487/RFC6851, January 2013,
681678
# <https://www.rfc-editor.org/info/rfc6851>.
679+
# [UTF8=ACCEPT[https://tools.ietf.org/html/rfc6855]]::
680+
# [UTF8=ONLY[https://tools.ietf.org/html/rfc6855]]::
681+
# Resnick, P., Ed., Newman, C., Ed., and S. Shen, Ed.,
682+
# "IMAP Support for UTF-8", RFC 6855, DOI 10.17487/RFC6855, March 2013,
683+
# <https://www.rfc-editor.org/info/rfc6855>.
682684
#
683685
# === IANA registries
684686
#
@@ -705,6 +707,12 @@ module Net
705707
class IMAP < Protocol
706708
VERSION = "0.3.4"
707709

710+
# Aliases for supported capabilities, to be used with the #enable command.
711+
ENABLE_ALIASES = {
712+
utf8: "UTF8=ACCEPT",
713+
"UTF8=ONLY" => "UTF8=ACCEPT",
714+
}.freeze
715+
708716
autoload :SASL, File.expand_path("imap/sasl", __dir__)
709717
autoload :StringPrep, File.expand_path("imap/stringprep", __dir__)
710718

@@ -812,12 +820,14 @@ def disconnected?
812820
# Capability requirements—other than +IMAP4rev1+—are listed in the
813821
# documentation for each command method.
814822
#
823+
# Related: #enable
824+
#
815825
# ===== Basic IMAP4rev1 capabilities
816826
#
817827
# All IMAP4rev1 servers must include +IMAP4rev1+ in their capabilities list.
818828
# All IMAP4rev1 servers must _implement_ the +STARTTLS+,
819829
# <tt>AUTH=PLAIN</tt>, and +LOGINDISABLED+ capabilities, and clients must
820-
# respect their presence or absence. See the capabilites requirements on
830+
# respect their presence or absence. See the capabilities requirements on
821831
# #starttls, #login, and #authenticate.
822832
#
823833
# ===== Using IMAP4rev1 extensions
@@ -1886,26 +1896,84 @@ def uid_thread(algorithm, search_keys, charset)
18861896

18871897
# Sends an {ENABLE command [RFC5161 §3.2]}[https://www.rfc-editor.org/rfc/rfc5161#section-3.1]
18881898
# {[IMAP4rev2 §6.3.1]}[https://www.rfc-editor.org/rfc/rfc9051#section-6.3.1]
1889-
# to enable the specified extenstions, which may be either an
1890-
# array or a string. Returns a list of the extensions that were enabled.
1891-
#
1892-
# Some of the extensions that use ENABLE permit the server to send
1893-
# syntax that this class cannot parse. Caution is advised.
1899+
# to enable the specified server +capabilities+. Each capability may be an
1900+
# array, string, or symbol. Returns a list of the capabilities that were
1901+
# enabled.
18941902
#
18951903
# The +ENABLE+ command is only valid in the _authenticated_ state, before
18961904
# any mailbox is selected.
18971905
#
1906+
# Related: #capability
1907+
#
18981908
# ===== Capabilities
18991909
#
1900-
# The server's capabilities must include +ENABLE+
1901-
# [RFC5161[https://tools.ietf.org/html/rfc5161]] or IMAP4REV2
1902-
# [RFC9051[https://tools.ietf.org/html/rfc9051]].
1910+
# The server's capabilities must include
1911+
# +ENABLE+ [RFC5161[https://tools.ietf.org/html/rfc5161]]
1912+
# or +IMAP4REV2+ [RFC9051[https://tools.ietf.org/html/rfc9051]].
19031913
#
19041914
# Additionally, the server capabilities must include a capability matching
19051915
# each enabled extension (usually the same name as the enabled extension).
1906-
def enable(extensions)
1916+
# The following capabilities may be enabled:
1917+
#
1918+
# [+:utf8+ --- an alias for <tt>"UTF8=ACCEPT"</tt>]
1919+
#
1920+
# In a future release, <tt>enable(:utf8)</tt> will enable either
1921+
# <tt>"UTF8=ACCEPT"</tt> or <tt>"IMAP4rev2"</tt>, depending on server
1922+
# capabilities.
1923+
#
1924+
# [<tt>"UTF8=ACCEPT"</tt> [RFC6855[https://tools.ietf.org/html/rfc6855]]]
1925+
#
1926+
# The server's capabilities must include <tt>UTF8=ACCEPT</tt> _or_
1927+
# <tt>UTF8=ONLY</tt>.
1928+
#
1929+
# This allows the server to send strings encoded as UTF-8 which might
1930+
# otherwise need to use a 7-bit encoding, such as {modified
1931+
# UTF-7}[::decode_utf7] for mailbox names, or RFC2047 encoded-words for
1932+
# message headers.
1933+
#
1934+
# *Note:* For now, strings with 8-bit characters are still _sent_ using
1935+
# "literal" syntax. A future update will change how commands send UTF-8
1936+
# strings when <tt>UTF8=ACCEPT</tt> is enabled. This update should be
1937+
# backward-compatible.
1938+
#
1939+
# *Note:* <em>A future update may set string encodings slightly
1940+
# differently</em>, e.g: "US-ASCII" when UTF-8 is not enabled, and "UTF-8"
1941+
# when it is. Currently, the encoding of strings sent as "quoted" or
1942+
# "text" will _always_ be "UTF-8", even when a 7-bit encoding is used
1943+
# (e.g. UTF-7, encoded-words, quoted-printable, base64). And currently,
1944+
# string "literals" sent by the server will always have an "ASCII-8BIT"
1945+
# (binary) encoding, even if they must contain UTF-8 data---although a
1946+
# server _should_ use "quoted" strings once <tt>UTF8=ACCEPT</tt> is
1947+
# enabled.
1948+
#
1949+
# [<tt>"UTF8=ONLY"</tt> [RFC6855[https://tools.ietf.org/html/rfc6855]]]
1950+
#
1951+
# A server that reports the <tt>UTF8=ONLY</tt> #capability _requires_ that
1952+
# the client <tt>enable("UTF8=ACCEPT")</tt> before any mailboxes may be
1953+
# selected. For convenience, <tt>enable("UTF8=ONLY")</tt> is aliased to
1954+
# <tt>enable("UTF8=ACCEPT")</tt>.
1955+
#
1956+
# ===== Unsupported capabilities
1957+
#
1958+
# *Note:* Some extensions that use ENABLE permit the server to send syntax
1959+
# that Net::IMAP cannot parse, which may raise an exception and disconnect.
1960+
# Some extensions may work, but the support may be incomplete, untested, or
1961+
# experimental.
1962+
#
1963+
# Until a capability is documented here as supported, enabling it may result
1964+
# in undocumented behavior and a future release may update with incompatible
1965+
# behavior <em>without warning or deprecation</em>.
1966+
#
1967+
# <em>Caution is advised.</em>
1968+
#
1969+
def enable(*capabilities)
1970+
capabilities = capabilities
1971+
.flatten
1972+
.map {|e| ENABLE_ALIASES[e] || e }
1973+
.uniq
1974+
.join(' ')
19071975
synchronize do
1908-
send_command("ENABLE #{[extensions].flatten.join(' ')}")
1976+
send_command("ENABLE #{capabilities}")
19091977
return @responses.delete("ENABLED")[-1]
19101978
end
19111979
end

lib/net/imap/response_parser.rb

Lines changed: 86 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,44 @@ module RFC5234
9494
SP = / /n
9595
end
9696

97+
# UTF-8, a transformation format of ISO 10646
98+
# >>>
99+
# UTF8-1 = %x00-7F
100+
# UTF8-tail = %x80-BF
101+
# UTF8-2 = %xC2-DF UTF8-tail
102+
# UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
103+
# %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
104+
# UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
105+
# %xF4 %x80-8F 2( UTF8-tail )
106+
# UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
107+
# UTF8-octets = *( UTF8-char )
108+
#
109+
# n.b. String * Integer is used for repetition, rather than /x{3}/,
110+
# because ruby 3.2's linear-time cache-based optimization doesn't work
111+
# with "bounded or fixed times repetition nesting in another repetition
112+
# (e.g. /(a{2,3})*/). It is an implementation issue entirely, but we
113+
# believe it is hard to support this case correctly."
114+
# See https://bugs.ruby-lang.org/issues/19104
115+
module RFC3629
116+
UTF8_1 = /[\x00-\x7f]/n # aka ASCII 7bit
117+
UTF8_TAIL = /[\x80-\xBF]/n
118+
UTF8_2 = /[\xC2-\xDF]#{UTF8_TAIL}/n
119+
UTF8_3 = Regexp.union(/\xE0[\xA0-\xBF]#{UTF8_TAIL}/n,
120+
/\xED[\x80-\x9F]#{UTF8_TAIL}/n,
121+
/[\xE1-\xEC]#{ UTF8_TAIL.source * 2}/n,
122+
/[\xEE-\xEF]#{ UTF8_TAIL.source * 2}/n)
123+
UTF8_4 = Regexp.union(/[\xF1-\xF3]#{ UTF8_TAIL.source * 3}/n,
124+
/\xF0[\x90-\xBF]#{UTF8_TAIL.source * 2}/n,
125+
/\xF4[\x80-\x8F]#{UTF8_TAIL.source * 2}/n)
126+
UTF8_CHAR = Regexp.union(UTF8_1, UTF8_2, UTF8_3, UTF8_4)
127+
UTF8_OCTETS = /#{UTF8_CHAR}*/n
128+
end
129+
97130
include RFC5234
131+
include RFC3629
98132

133+
# quoted-specials = DQUOTE / "\"
134+
QUOTED_SPECIALS = /["\\]/n
99135
# resp-specials = "]"
100136
RESP_SPECIALS = /[\]]/n
101137

@@ -106,9 +142,44 @@ module RFC5234
106142
CODE_TEXT_CHAR = TEXT_CHAR - RESP_SPECIALS
107143
CODE_TEXT = /#{CODE_TEXT_CHAR}+/n
108144

145+
# RFC3501:
146+
# QUOTED-CHAR = <any TEXT-CHAR except quoted-specials> /
147+
# "\" quoted-specials
148+
# RFC9051:
149+
# QUOTED-CHAR = <any TEXT-CHAR except quoted-specials> /
150+
# "\" quoted-specials / UTF8-2 / UTF8-3 / UTF8-4
151+
# RFC3501 & RFC9051:
152+
# quoted = DQUOTE *QUOTED-CHAR DQUOTE
153+
QUOTED_CHAR_safe = TEXT_CHAR - QUOTED_SPECIALS
154+
QUOTED_CHAR_esc = /\\#{QUOTED_SPECIALS}/n
155+
QUOTED_CHAR_rev1 = Regexp.union(QUOTED_CHAR_safe, QUOTED_CHAR_esc)
156+
QUOTED_CHAR_rev2 = Regexp.union(QUOTED_CHAR_rev1,
157+
UTF8_2, UTF8_3, UTF8_4)
158+
QUOTED_rev1 = /"(#{QUOTED_CHAR_rev1}*)"/n
159+
QUOTED_rev2 = /"(#{QUOTED_CHAR_rev2}*)"/n
160+
109161
# RFC3501:
110162
# text = 1*TEXT-CHAR
163+
# RFC9051:
164+
# text = 1*(TEXT-CHAR / UTF8-2 / UTF8-3 / UTF8-4)
165+
# ; Non-ASCII text can only be returned
166+
# ; after ENABLE IMAP4rev2 command
111167
TEXT_rev1 = /#{TEXT_CHAR}+/
168+
TEXT_rev2 = /#{Regexp.union TEXT_CHAR, UTF8_2, UTF8_3, UTF8_4}+/
169+
170+
module_function
171+
172+
def unescape_quoted!(quoted)
173+
quoted
174+
&.gsub!(/\\(#{QUOTED_SPECIALS})/n, "\\1")
175+
&.force_encoding("UTF-8")
176+
end
177+
178+
def unescape_quoted(quoted)
179+
quoted
180+
&.gsub(/\\(#{QUOTED_SPECIALS})/n, "\\1")
181+
&.force_encoding("UTF-8")
182+
end
112183

113184
end
114185

@@ -118,7 +189,7 @@ module RFC5234
118189
(?# 2: NIL )(NIL)(?=[\x80-\xff(){ \x00-\x1f\x7f%*"\\\[\]+])|\
119190
(?# 3: NUMBER )(\d+)(?=[\x80-\xff(){ \x00-\x1f\x7f%*"\\\[\]+])|\
120191
(?# 4: ATOM )([^\x80-\xff(){ \x00-\x1f\x7f%*"\\\[\]+]+)|\
121-
(?# 5: QUOTED )"((?:[^\x00\r\n"\\]|\\["\\])*)"|\
192+
(?# 5: QUOTED )#{Patterns::QUOTED_rev2}|\
122193
(?# 6: LPAR )(\()|\
123194
(?# 7: RPAR )(\))|\
124195
(?# 8: BSLASH )(\\)|\
@@ -136,13 +207,13 @@ module RFC5234
136207
(?# 1: SPACE )( )|\
137208
(?# 2: NIL )(NIL)|\
138209
(?# 3: NUMBER )(\d+)|\
139-
(?# 4: QUOTED )"((?:[^\x00\r\n"\\]|\\["\\])*)"|\
210+
(?# 4: QUOTED )#{Patterns::QUOTED_rev2}|\
140211
(?# 5: LITERAL )\{(\d+)\}\r\n|\
141212
(?# 6: LPAR )(\()|\
142213
(?# 7: RPAR )(\)))/ni
143214

144215
# text, after 'resp-text-code "]"'
145-
TEXT_REGEXP = /\G(#{Patterns::TEXT_rev1})/n
216+
TEXT_REGEXP = /\G(#{Patterns::TEXT_rev2})/n
146217

147218
# resp-text-code, after 'atom SP'
148219
CTEXT_REGEXP = /\G(#{Patterns::CODE_TEXT})/n
@@ -1190,15 +1261,20 @@ def namespace_response_extensions
11901261
data
11911262
end
11921263

1193-
# text = 1*TEXT-CHAR
1194-
# TEXT-CHAR = <any CHAR except CR and LF>
1264+
# TEXT-CHAR = <any CHAR except CR and LF>
1265+
# RFC3501:
1266+
# text = 1*TEXT-CHAR
1267+
# RFC9051:
1268+
# text = 1*(TEXT-CHAR / UTF8-2 / UTF8-3 / UTF8-4)
1269+
# ; Non-ASCII text can only be returned
1270+
# ; after ENABLE IMAP4rev2 command
11951271
def text
1196-
match_re(TEXT_REGEXP, "text")[0]
1272+
match_re(TEXT_REGEXP, "text")[0].force_encoding("UTF-8")
11971273
end
11981274

11991275
# an "accept" version of #text
12001276
def text?
1201-
accept_re(TEXT_REGEXP)&.[](0)
1277+
accept_re(TEXT_REGEXP)&.[](0)&.force_encoding("UTF-8")
12021278
end
12031279

12041280
# RFC3501:
@@ -1349,9 +1425,7 @@ def address
13491425
mailbox = $3
13501426
host = $4
13511427
for s in [name, route, mailbox, host]
1352-
if s
1353-
s.gsub!(/\\(["\\])/n, "\\1")
1354-
end
1428+
Patterns.unescape_quoted! s
13551429
end
13561430
else
13571431
name = nstring
@@ -1533,8 +1607,7 @@ def next_token
15331607
elsif $4
15341608
return Token.new(T_ATOM, $+)
15351609
elsif $5
1536-
return Token.new(T_QUOTED,
1537-
$+.gsub(/\\(["\\])/n, "\\1"))
1610+
return Token.new(T_QUOTED, Patterns.unescape_quoted($+))
15381611
elsif $6
15391612
return Token.new(T_LPAR, $+)
15401613
elsif $7
@@ -1577,8 +1650,7 @@ def next_token
15771650
elsif $3
15781651
return Token.new(T_NUMBER, $+)
15791652
elsif $4
1580-
return Token.new(T_QUOTED,
1581-
$+.gsub(/\\(["\\])/n, "\\1"))
1653+
return Token.new(T_QUOTED, Patterns.unescape_quoted($+))
15821654
elsif $5
15831655
len = $+.to_i
15841656
val = @str[@pos, len]
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
2+
---
3+
:tests:
4+
5+
test_utf8_in_list_mailbox:
6+
:response: "* LIST () \"/\" \"☃️&☺️\"\r\n"
7+
:expected: !ruby/struct:Net::IMAP::UntaggedResponse
8+
name: LIST
9+
data: !ruby/struct:Net::IMAP::MailboxList
10+
attr: []
11+
delim: "/"
12+
name: "☃️&☺️"
13+
raw_data: !binary |-
14+
KiBMSVNUICgpICIvIiAi4piD77iPJuKYuu+4jyINCg==
15+
16+
test_utf8_in_resp_text:
17+
:response: "* OK 𝖀𝖓𝖎𝖈𝖔𝖉𝖊 «α-ω» ほげ ふが ʇɐɥʍ\r\n"
18+
:expected: !ruby/struct:Net::IMAP::UntaggedResponse
19+
name: OK
20+
data: !ruby/struct:Net::IMAP::ResponseText
21+
text: "𝖀𝖓𝖎𝖈𝖔𝖉𝖊 «α-ω» ほげ ふが ʇɐɥʍ"
22+
raw_data: !binary |-
23+
KiBPSyDwnZaA8J2Wk/Cdlo7wnZaI8J2WlPCdlonwnZaKIMKrzrEtz4nCuyDjgbvjgZIg44G144 GMIMqHyZDJpcqNDQo=

test/net/imap/net_imap_test_helpers.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def generate_tests_from(fixture_data: nil, fixture_file: nil)
4040
case type
4141

4242
when :parser_assert_equal
43-
response = test.fetch(:response)
43+
response = test.fetch(:response).force_encoding "ASCII-8BIT"
4444
expected = test.fetch(:expected)
4545

4646
define_method name do

0 commit comments

Comments
 (0)