✨ Add BINARY FETCH support

nevans · nevans · commit 3410e14aadad · 2023-11-10T09:09:06.000-05:00
Lex and parse LITERAL8, string8, nstring8.
Add section_binary and section_part

The BINARY extension isn't _fully_ supported; that requires updates to
the APPEND command.  But this should be sufficient for IMAP4rev2, which
only requires the FETCH part of the extension.
diff --git a/lib/net/imap.rb b/lib/net/imap.rb
@@ -404,18 +404,18 @@ module Net
   #
   # Although IMAP4rev2[https://tools.ietf.org/html/rfc9051] is not supported
   # yet, Net::IMAP supports several extensions that have been folded into it:
-  # +ENABLE+, +IDLE+, +MOVE+, +NAMESPACE+, +SASL-IR+, +UIDPLUS+, and +UNSELECT+.
+  # +ENABLE+, +IDLE+, +MOVE+, +NAMESPACE+, +SASL-IR+, +UIDPLUS+, +UNSELECT+, and
+  # the fetch side of +BINARY+.
   # Commands for these extensions are listed with the {Core IMAP
   # commands}[rdoc-ref:Net::IMAP@Core+IMAP+commands], above.
   #
   # >>>
   #   <em>The following are folded into +IMAP4rev2+ but are currently
   #   unsupported or incompletely supported by</em> Net::IMAP<em>: RFC4466
-  #   extensions, +ESEARCH+, +SEARCHRES+, +LIST-EXTENDED+,
-  #   +LIST-STATUS+, +LITERAL-+, +BINARY+ fetch, and +SPECIAL-USE+.  The
-  #   following extensions are implicitly supported, but will be updated with
-  #   more direct support: RFC5530 response codes, <tt>STATUS=SIZE</tt>, and
-  #   <tt>STATUS=DELETED</tt>.</em>
+  #   extensions, +ESEARCH+, +SEARCHRES+, +LIST-EXTENDED+, +LIST-STATUS+,
+  #   +LITERAL-+, and +SPECIAL-USE+.  The following extensions are implicitly
+  #   supported, but will be updated with more direct support: RFC5530 response
+  #   codes, <tt>STATUS=SIZE</tt>, and <tt>STATUS=DELETED</tt>.</em>
   #
   # ==== RFC2087: +QUOTA+
   # - #getquota: returns the resource usage and limits for a quota root
@@ -437,6 +437,15 @@ module Net
   # ==== RFC2971: +ID+
   # - #id: exchanges client and server implementation information.
   #
+  # ==== RFC3516: +BINARY+
+  # The fetch side of +BINARY+ has been folded into
+  # IMAP4rev2[https://tools.ietf.org/html/rfc9051].
+  # - Updates #fetch and #uid_fetch with the +BINARY+, +BINARY.PEEK+, and
+  #   +BINARY.SIZE+ items.  See FetchData#binary and FetchData#binary_size.
+  #
+  # >>>
+  #   *NOTE:* The binary extension to the #append command is _not_ supported yet.
+  #
   # ==== RFC3691: +UNSELECT+
   # Folded into IMAP4rev2[https://tools.ietf.org/html/rfc9051] and also included
   # above with {Core IMAP commands}[rdoc-ref:Net::IMAP@Core+IMAP+commands].
@@ -612,6 +621,10 @@ module Net
   # [ID[https://tools.ietf.org/html/rfc2971]]::
   #   Showalter, T., "IMAP4 ID extension", RFC 2971, DOI 10.17487/RFC2971,
   #   October 2000, <https://www.rfc-editor.org/info/rfc2971>.
+  # [BINARY[https://tools.ietf.org/html/rfc3516]]::
+  #   Nerenberg, L., "IMAP4 Binary Content Extension", RFC 3516,
+  #   DOI 10.17487/RFC3516, April 2003,
+  #   <https://www.rfc-editor.org/info/rfc3516>.
   # [ACL[https://tools.ietf.org/html/rfc4314]]::
   #   Melnikov, A., "IMAP4 Access Control List (ACL) Extension", RFC 4314,
   #   DOI 10.17487/RFC4314, December 2005,
diff --git a/lib/net/imap/fetch_data.rb b/lib/net/imap/fetch_data.rb
@@ -33,6 +33,14 @@ class IMAP < Protocol
     # * <b><tt>"INTERNALDATE"</tt></b> --- See #internaldate.
     # * <b><tt>"RFC822.SIZE"</tt></b> --- See #rfc822_size.
     #
+    # IMAP4rev2[https://www.rfc-editor.org/rfc/rfc9051.html] adds the
+    # additional fetch items from the +BINARY+ extension
+    # {[RFC3516]}[https://www.rfc-editor.org/rfc/rfc3516.html]:
+    #
+    # * <b><tt>"BINARY[#{part}]"</tt></b>,
+    #   <b><tt>"BINARY[#{part}]<#{offset}>"</tt></b> --- See #binary.
+    # * <b><tt>"BINARY.SIZE[#{part}]"</tt></b> --- See #binary_size.
+    #
     # Several static message attributes in
     # IMAP4rev1[https://www.rfc-editor.org/rfc/rfc3501.html] are obsolete and
     # been removed from
@@ -47,8 +55,7 @@ class IMAP < Protocol
     #
     # [Note:]
     #   >>>
-    #     Additional static fields are defined in \IMAP extensions and
-    #     IMAP4rev2[https://www.rfc-editor.org/rfc/rfc9051.html], but
+    #     Additional static fields are defined in other \IMAP extensions, but
     #     Net::IMAP can't parse them yet.
     #
     # ==== Dynamic message attributes
@@ -389,6 +396,49 @@ def rfc822_text; attr["RFC822.TEXT"] end
       # This is the same as getting the value for <tt>"UID"</tt> from #attr.
       def uid; attr["UID"] end
 
+      # :call-seq:
+      #   binary(*part_nums, offset: nil) -> string or nil
+      #
+      # Returns the binary representation of a particular MIME part, which has
+      # already been decoded according to its Content-Transfer-Encoding.
+      #
+      # See #part for a description of +part_nums+ and +offset+.
+      #
+      # This is the same as getting the value of
+      # <tt>"BINARY[#{part_nums.join(".")}]"</tt> or
+      # <tt>"BINARY[#{part_nums.join(".")}]<#{offset}>"</tt> from #attr.
+      #
+      # The server must support either
+      # IMAP4rev2[https://www.rfc-editor.org/rfc/rfc9051.html]
+      # or the +BINARY+ extension
+      # {[RFC3516]}[https://www.rfc-editor.org/rfc/rfc3516.html].
+      #
+      # See also: #binary_size, #mime
+      def binary(*part_nums, offset: nil)
+        attr[section_attr("BINARY", part_nums, offset: offset)]
+      end
+
+      # :call-seq:
+      #   binary_size(*part_nums) -> integer or nil
+      #
+      # Returns the decoded size of a particular MIME part (the size to expect
+      # in response to a <tt>BINARY</tt> fetch request).
+      #
+      # See #part for a description of +part_nums+.
+      #
+      # This is the same as getting the value of
+      # <tt>"BINARY.SIZE[#{part_nums.join(".")}]"</tt> from #attr.
+      #
+      # The server must support either
+      # IMAP4rev2[https://www.rfc-editor.org/rfc/rfc9051.html]
+      # or the +BINARY+ extension
+      # {[RFC3516]}[https://www.rfc-editor.org/rfc/rfc3516.html].
+      #
+      # See also: #binary, #mime
+      def binary_size(*part_nums)
+        attr[section_attr("BINARY.SIZE", part_nums)]
+      end
+
       # :call-seq: modseq -> Integer
       #
       # The modification sequence number associated with this IMAP message.
diff --git a/lib/net/imap/response_parser.rb b/lib/net/imap/response_parser.rb
@@ -54,6 +54,7 @@ def parse(str)
       T_STAR     = :STAR         # atom special; list wildcard
       T_PERCENT  = :PERCENT      # atom special; list wildcard
       T_LITERAL  = :LITERAL      # starts with atom special
+      T_LITERAL8 = :LITERAL8     # starts with atom char "~"
       T_CRLF     = :CRLF         # atom special; text special; quoted special
       T_TEXT     = :TEXT         # any char except CRLF
       T_EOF      = :EOF          # end of response string
@@ -279,6 +280,16 @@ module RFC3629
         #                        ; sent from server to the client.
         LITERAL              = /\{(\d+)\}\r\n/n
 
+        # RFC3516 (BINARY):
+        #   literal8         =   "~{" number "}" CRLF *OCTET
+        #                        ; <number> represents the number of OCTETs
+        #                        ; in the response string.
+        # RFC9051:
+        #   literal8         =  "~{" number64 "}" CRLF *OCTET
+        #                        ; <number64> represents the number of OCTETs
+        #                        ; in the response string.
+        LITERAL8             = /~\{(\d+)\}\r\n/n
+
         module_function
 
         def unescape_quoted!(quoted)
@@ -298,27 +309,28 @@ def unescape_quoted(quoted)
       # the default, used in most places
       BEG_REGEXP = /\G(?:\
 (?# 1:  SPACE   )( )|\
-(?# 2:  ATOM prefixed with a compatible subtype)\
+(?# 2:  LITERAL8)#{Patterns::LITERAL8}|\
+(?# 3:  ATOM prefixed with a compatible subtype)\
 ((?:\
-(?# 3:  NIL     )(NIL)|\
-(?# 4:  NUMBER  )(\d+)|\
-(?# 5:  PLUS    )(\+))\
-(?# 6:  ATOM remaining after prefix )(#{Patterns::ATOMISH})?\
+(?# 4:  NIL     )(NIL)|\
+(?# 5:  NUMBER  )(\d+)|\
+(?# 6:  PLUS    )(\+))\
+(?# 7:  ATOM remaining after prefix )(#{Patterns::ATOMISH})?\
 (?# This enables greedy alternation without lookahead, in linear time.)\
 )|\
 (?# Also need to check for ATOM without a subtype prefix.)\
-(?# 7:  ATOM    )(#{Patterns::ATOMISH})|\
-(?# 8:  QUOTED  )#{Patterns::QUOTED_rev2}|\
-(?# 9: LPAR    )(\()|\
-(?# 10: RPAR    )(\))|\
-(?# 11: BSLASH  )(\\)|\
-(?# 12: STAR    )(\*)|\
-(?# 13: LBRA    )(\[)|\
-(?# 14: RBRA    )(\])|\
-(?# 15: LITERAL )#{Patterns::LITERAL}|\
-(?# 16: PERCENT )(%)|\
-(?# 17: CRLF    )(\r\n)|\
-(?# 18: EOF     )(\z))/ni
+(?# 8:  ATOM    )(#{Patterns::ATOMISH})|\
+(?# 9:  QUOTED  )#{Patterns::QUOTED_rev2}|\
+(?# 10: LPAR    )(\()|\
+(?# 11: RPAR    )(\))|\
+(?# 12: BSLASH  )(\\)|\
+(?# 13: STAR    )(\*)|\
+(?# 14: LBRA    )(\[)|\
+(?# 15: RBRA    )(\])|\
+(?# 16: LITERAL )#{Patterns::LITERAL}|\
+(?# 17: PERCENT )(%)|\
+(?# 18: CRLF    )(\r\n)|\
+(?# 19: EOF     )(\z))/ni
 
       # envelope, body(structure), namespaces
       DATA_REGEXP = /\G(?:\
@@ -359,6 +371,9 @@ def unescape_quoted(quoted)
       #   string          = quoted / literal
       def_token_matchers :string,  T_QUOTED, T_LITERAL
 
+      # used by nstring8 = nstring / literal8
+      def_token_matchers :string8, T_QUOTED, T_LITERAL, T_LITERAL8
+
       # use where string represents "LABEL" values
       def_token_matchers :case_insensitive__string,
                          T_QUOTED, T_LITERAL,
@@ -460,6 +475,10 @@ def nstring
         NIL? ? nil : string
       end
 
+      def nstring8
+        NIL? ? nil : string8
+      end
+
       def nquoted
         NIL? ? nil : quoted
       end
@@ -740,6 +759,8 @@ def msg_att(n)
             when "ENVELOPE"             then envelope
             when "INTERNALDATE"         then date_time
             when "RFC822.SIZE"          then number64
+            when /\ABINARY\[/ni         then nstring8           # BINARY, IMAP4rev2
+            when /\ABINARY\.SIZE\[/ni   then number             # BINARY, IMAP4rev2
             when "RFC822"               then nstring            # not in rev2
             when "RFC822.HEADER"        then nstring            # not in rev2
             when "RFC822.TEXT"          then nstring            # not in rev2
@@ -762,11 +783,18 @@ def msg_att__label
           lbra? and rbra
         when "BODY"
           peek_lbra? and name << section and
-            peek_str?("<") and name << atom # partial
+            peek_str?("<") and name << gt__number__lt # partial
+        when "BINARY", "BINARY.SIZE"
+          name << section_binary
+          # see https://www.rfc-editor.org/errata/eid7246 and the note above
+          peek_str?("<") and name << gt__number__lt # partial
         end
         name
       end
 
+      # this represents the partial origin offset ("<" number ">") for BODY or BINARY
+      alias gt__number__lt atom
+
       def envelope
         @lex_state = EXPR_DATA
         token = lookahead
@@ -1070,6 +1098,13 @@ def section
         str << rbra
       end
 
+      # section-binary  = "[" [section-part] "]"
+      def section_binary
+        str = +lbra
+        str << section_part unless peek_rbra?
+        str << rbra
+      end
+
       # section-spec    = section-msgtext / (section-part ["." section-text])
       # section-msgtext = "HEADER" /
       #                   "HEADER.FIELDS" [".NOT"] SP header-list /
@@ -1100,6 +1135,11 @@ def header_list
         str << rpar
       end
 
+      # section-part    = nz-number *("." nz-number)
+      #                     ; body part reference.
+      #                     ; Allows for accessing nested body parts.
+      alias section_part atom
+
       # RFC3501 & RFC9051:
       #   header-fld-name = astring
       #
@@ -1789,42 +1829,47 @@ def next_token
             @pos = $~.end(0)
             if $1
               return Token.new(T_SPACE, $+)
-            elsif $2 && $6
+            elsif $2
+              len = $+.to_i
+              val = @str[@pos, len]
+              @pos += len
+              return Token.new(T_LITERAL8, val)
+            elsif $3 && $7
               # greedily match ATOM, prefixed with NUMBER, NIL, or PLUS.
-              return Token.new(T_ATOM, $2)
-            elsif $3
-              return Token.new(T_NIL, $+)
+              return Token.new(T_ATOM, $3)
             elsif $4
-              return Token.new(T_NUMBER, $+)
+              return Token.new(T_NIL, $+)
             elsif $5
+              return Token.new(T_NUMBER, $+)
+            elsif $6
               return Token.new(T_PLUS, $+)
-            elsif $7
+            elsif $8
               # match ATOM, without a NUMBER, NIL, or PLUS prefix
               return Token.new(T_ATOM, $+)
-            elsif $8
-              return Token.new(T_QUOTED, Patterns.unescape_quoted($+))
             elsif $9
-              return Token.new(T_LPAR, $+)
+              return Token.new(T_QUOTED, Patterns.unescape_quoted($+))
             elsif $10
-              return Token.new(T_RPAR, $+)
+              return Token.new(T_LPAR, $+)
             elsif $11
-              return Token.new(T_BSLASH, $+)
+              return Token.new(T_RPAR, $+)
             elsif $12
-              return Token.new(T_STAR, $+)
+              return Token.new(T_BSLASH, $+)
             elsif $13
-              return Token.new(T_LBRA, $+)
+              return Token.new(T_STAR, $+)
             elsif $14
-              return Token.new(T_RBRA, $+)
+              return Token.new(T_LBRA, $+)
             elsif $15
+              return Token.new(T_RBRA, $+)
+            elsif $16
               len = $+.to_i
               val = @str[@pos, len]
               @pos += len
               return Token.new(T_LITERAL, val)
-            elsif $16
-              return Token.new(T_PERCENT, $+)
             elsif $17
-              return Token.new(T_CRLF, $+)
+              return Token.new(T_PERCENT, $+)
             elsif $18
+              return Token.new(T_CRLF, $+)
+            elsif $19
               return Token.new(T_EOF, $+)
             else
               parse_error("[Net::IMAP BUG] BEG_REGEXP is invalid")
diff --git a/test/net/imap/fixtures/response_parser/ruby.png b/test/net/imap/fixtures/response_parser/ruby.png
diff --git a/test/net/imap/test_fetch_data.rb b/test/net/imap/test_fetch_data.rb
@@ -164,4 +164,25 @@ class FetchDataTest < Test::Unit::TestCase
     assert_equal "partial mime", data.mime(1, 2, offset: 456)
   end
 
+  test "#binary(1, 2, 3, offset: 1) returns the BINARY[1.2.3]<1> attr" do
+    data = FetchData.new(1, {
+      "BINARY[]" => "binary\0whole".b,
+      "BINARY[1.2.3]" => "binary\0part".b,
+      "BINARY[1.2.3]<1>" => "inary\0pa".b,
+    })
+    assert_equal "binary\0whole".b, data.binary
+    assert_equal "binary\0part".b,  data.binary(1, 2, 3)
+    assert_equal "inary\0pa".b,     data.binary(1, 2, 3, offset: 1)
+  end
+
+  test "#binary_size(1, 2, 3) returns the BINARY.SIZE[1.2.3] attr" do
+    data = FetchData.new(1, {
+      "BINARY.SIZE[]"      => 987_654,
+      "BINARY.SIZE[1.2.3]" => 123_456,
+    })
+    assert_equal 987_654, data.binary_size
+    assert_equal 123_456, data.binary_size(1, 2, 3)
+    assert_equal 123_456, data.binary_size([1, 2, 3])
+  end
+
 end
diff --git a/test/net/imap/test_imap_response_parser.rb b/test/net/imap/test_imap_response_parser.rb