✨ Make text in resp-text optional (IMAP4rev2)

nevans · nevans · commit dde965b2ac2f · 2023-02-12T00:23:51.000-05:00
In RFC3501 (IMAP4rev1):

    resp-text       = ["[" resp-text-code "]" SP] text

In RFC9051 (IMAP4rev2):

    resp-text       = ["[" resp-text-code "]" SP] [text]

And in RFC9051 Appendix E:

     23.  resp-text ABNF non-terminal was updated to allow for empty text.

In the spirit of Appendix E, item 23 (and based on some actual server
responses I've seen over the years), I've leniently re-interpreted this
as also allowing us to drop the trailing `SP` char after
`[resp-text-code parsable code data]`, like so:

    resp-text       = "[" resp-text-code "]" [SP [text]] / [text]

Actually, the original parser already _mostly_ behaved this way, because
the original regexps for `T_TEXT` used `*` and not `+`. But, as I
updated the parser in many other places to more closely match the RFCs,
that broke this behavior. This commit originally came _after_ many many
other changes.  While rebasing, I moved this commit first because that
simplified later commits.

Also:
* ♻️ Add `Patterns` module, to organize regexps.
* ♻️ Use `Patterns::CharClassSubtraction` refinement to simplify
  exceptions.
* ♻️ Add `ParserUtils::Generator#def_char_matchers` to define `SP`,
  `LBRA`, `RBRA`.
* ♻️ Add `ParserUtils#{match,accept}_re` to replace `TEXT`, `CTEXT` lex
  states.
* ♻️ Remove unused `lex_state` kwarg from `match`.
diff --git a/lib/net/imap/response_parser.rb b/lib/net/imap/response_parser.rb
@@ -9,6 +9,7 @@ class IMAP < Protocol
     # Parses an \IMAP server response.
     class ResponseParser
       include ParserUtils
+      extend  ParserUtils::Generator
 
       # :call-seq: Net::IMAP::ResponseParser.new -> Net::IMAP::ResponseParser
       def initialize
@@ -38,9 +39,6 @@ def parse(str)
 
       EXPR_BEG   = :EXPR_BEG     # the default, used in most places
       EXPR_DATA  = :EXPR_DATA    # envelope, body(structure), namespaces
-      EXPR_TEXT  = :EXPR_TEXT    # text, after 'resp-text-code "]"'
-      EXPR_RTEXT = :EXPR_RTEXT   # resp-text, before "["
-      EXPR_CTEXT = :EXPR_CTEXT   # resp-text-code, after 'atom SP'
 
       T_SPACE    = :SPACE        # atom special
       T_ATOM     = :ATOM         # atom (subset of astring chars)
@@ -60,6 +58,60 @@ def parse(str)
       T_TEXT     = :TEXT         # any char except CRLF
       T_EOF      = :EOF          # end of response string
 
+      module Patterns
+
+        module CharClassSubtraction
+          refine Regexp do
+            def -(rhs); /[#{source}&&[^#{rhs.source}]]/n.freeze end
+          end
+        end
+        using CharClassSubtraction
+
+        # From RFC5234, "Augmented BNF for Syntax Specifications: ABNF"
+        # >>>
+        #   ALPHA   =  %x41-5A / %x61-7A   ; A-Z / a-z
+        #   CHAR    = %x01-7F
+        #   CRLF    =  CR LF
+        #                   ; Internet standard newline
+        #   CTL     = %x00-1F / %x7F
+        #                ; controls
+        #   DIGIT   =  %x30-39
+        #                   ; 0-9
+        #   DQUOTE  =  %x22
+        #                   ; " (Double Quote)
+        #   HEXDIG  =  DIGIT / "A" / "B" / "C" / "D" / "E" / "F"
+        #   OCTET   = %x00-FF
+        #   SP      =  %x20
+        module RFC5234
+          ALPHA     = /[A-Za-z]/n
+          CHAR      = /[\x01-\x7f]/n
+          CRLF      = /\r\n/n
+          CTL       = /[\x00-\x1F\x7F]/n
+          DIGIT     = /\d/n
+          DQUOTE    = /"/n
+          HEXDIG    = /\h/
+          OCTET     = /[\x00-\xFF]/n # not using /./m for embedding purposes
+          SP        = / /n
+        end
+
+        include RFC5234
+
+        # resp-specials   = "]"
+        RESP_SPECIALS     = /[\]]/n
+
+        # TEXT-CHAR       = <any CHAR except CR and LF>
+        TEXT_CHAR         = CHAR - /[\r\n]/
+
+        # resp-text-code  = ... / atom [SP 1*<any TEXT-CHAR except "]">]
+        CODE_TEXT_CHAR    = TEXT_CHAR - RESP_SPECIALS
+        CODE_TEXT         = /#{CODE_TEXT_CHAR}+/n
+
+        # RFC3501:
+        #   text          = 1*TEXT-CHAR
+        TEXT_rev1         = /#{TEXT_CHAR}+/
+
+      end
+
       # the default, used in most places
       BEG_REGEXP = /\G(?:\
 (?# 1:  SPACE   )( +)|\
@@ -90,20 +142,18 @@ def parse(str)
 (?# 7:  RPAR    )(\)))/ni
 
       # text, after 'resp-text-code "]"'
-      TEXT_REGEXP = /\G(?:\
-(?# 1:  TEXT    )([^\x00\r\n]*))/ni
-
-      # resp-text, before "["
-      RTEXT_REGEXP = /\G(?:\
-(?# 1:  LBRA    )(\[)|\
-(?# 2:  TEXT    )([^\x00\r\n]*))/ni
+      TEXT_REGEXP = /\G(#{Patterns::TEXT_rev1})/n
 
       # resp-text-code, after 'atom SP'
-      CTEXT_REGEXP = /\G(?:\
-(?# 1:  TEXT    )([^\x00\r\n\]]*))/ni
+      CTEXT_REGEXP = /\G(#{Patterns::CODE_TEXT})/n
 
       Token = Struct.new(:symbol, :value)
 
+      def_char_matchers :SP,   " ", :T_SPACE
+
+      def_char_matchers :lbra, "[", :T_LBRA
+      def_char_matchers :rbra, "]", :T_RBRA
+
       # atom            = 1*ATOM-CHAR
       #
       # TODO: match atom entirely by regexp (in the "lexer")
@@ -1143,20 +1193,27 @@ def namespace_response_extensions
       # text            = 1*TEXT-CHAR
       # TEXT-CHAR       = <any CHAR except CR and LF>
       def text
-        match(T_TEXT, lex_state: EXPR_TEXT).value
+        match_re(TEXT_REGEXP, "text")[0]
       end
 
-      # resp-text       = ["[" resp-text-code "]" SP] text
+      # an "accept" version of #text
+      def text?
+        accept_re(TEXT_REGEXP)&.[](0)
+      end
+
+      # RFC3501:
+      #   resp-text       = ["[" resp-text-code "]" SP] text
+      # RFC9051:
+      #   resp-text       = ["[" resp-text-code "]" SP] [text]
+      #
+      # We leniently re-interpret this as
+      #   resp-text       = "[" resp-text-code "]" [SP [text]] / [text]
       def resp_text
-        token = match(T_LBRA, T_TEXT, lex_state: EXPR_RTEXT)
-        case token.symbol
-        when T_LBRA
-          code = resp_text_code
-          match(T_RBRA)
-          accept_space # violating RFC
-          ResponseText.new(code, text)
-        when T_TEXT
-          ResponseText.new(nil, token.value)
+        if lbra?
+          code = resp_text_code; rbra
+          ResponseText.new(code, SP? && text? || "")
+        else
+          ResponseText.new(nil, text? || "")
         end
       end
 
@@ -1198,15 +1255,19 @@ def resp_text_code
           token = lookahead
           if token.symbol == T_SPACE
             shift_token
-            token = match(T_TEXT, lex_state: EXPR_CTEXT)
-            result = ResponseCode.new(name, token.value)
+            result = ResponseCode.new(name, text_chars_except_rbra)
           else
             result = ResponseCode.new(name, nil)
           end
         end
         return result
       end
 
+      # 1*<any TEXT-CHAR except "]">
+      def text_chars_except_rbra
+        match_re(CTEXT_REGEXP, '1*<any TEXT-CHAR except "]">')[0]
+      end
+
       def charset_list
         result = []
         if accept(T_SPACE)
@@ -1447,21 +1508,6 @@ def nil_atom
 
       SPACES_REGEXP = /\G */n
 
-      # This advances @pos directly so it's safe before changing @lex_state.
-      def accept_space
-        if @token
-          if @token.symbol == T_SPACE
-            shift_token
-            " "
-          end
-        elsif @str[@pos] == " "
-          @pos += 1
-          " "
-        end
-      end
-
-      alias SP? accept_space
-
       # The RFC is very strict about this and usually we should be too.
       # But skipping spaces is usually a safe workaround for buggy servers.
       #
@@ -1549,44 +1595,6 @@ def next_token
             @str.index(/\S*/n, @pos)
             parse_error("unknown token - %s", $&.dump)
           end
-        when EXPR_TEXT
-          if @str.index(TEXT_REGEXP, @pos)
-            @pos = $~.end(0)
-            if $1
-              return Token.new(T_TEXT, $+)
-            else
-              parse_error("[Net::IMAP BUG] TEXT_REGEXP is invalid")
-            end
-          else
-            @str.index(/\S*/n, @pos)
-            parse_error("unknown token - %s", $&.dump)
-          end
-        when EXPR_RTEXT
-          if @str.index(RTEXT_REGEXP, @pos)
-            @pos = $~.end(0)
-            if $1
-              return Token.new(T_LBRA, $+)
-            elsif $2
-              return Token.new(T_TEXT, $+)
-            else
-              parse_error("[Net::IMAP BUG] RTEXT_REGEXP is invalid")
-            end
-          else
-            @str.index(/\S*/n, @pos)
-            parse_error("unknown token - %s", $&.dump)
-          end
-        when EXPR_CTEXT
-          if @str.index(CTEXT_REGEXP, @pos)
-            @pos = $~.end(0)
-            if $1
-              return Token.new(T_TEXT, $+)
-            else
-              parse_error("[Net::IMAP BUG] CTEXT_REGEXP is invalid")
-            end
-          else
-            @str.index(/\S*/n, @pos) #/
-            parse_error("unknown token - %s", $&.dump)
-          end
         else
           parse_error("invalid @lex_state - %s", @lex_state.inspect)
         end
diff --git a/lib/net/imap/response_parser/parser_utils.rb b/lib/net/imap/response_parser/parser_utils.rb
@@ -8,26 +8,58 @@ class ResponseParser
       # (internal API, subject to change)
       module ParserUtils # :nodoc:
 
-        private
+        module Generator
+
+          LOOKAHEAD = "(@token ||= next_token)"
+          SHIFT_TOKEN = "(@token = nil)"
+
+          # we can skip lexer for single character matches, as a shortcut
+          def def_char_matchers(name, char, token)
+            match_name = name.match(/\A[A-Z]/) ? "#{name}!" : name
+            char = char.dump
+            class_eval <<~RUBY, __FILE__, __LINE__ + 1
+              # frozen_string_literal: true
 
-        def match(*args, lex_state: @lex_state)
-          if @token && lex_state != @lex_state
-            parse_error("invalid lex_state change to %s with unconsumed token",
-                        lex_state)
+              # like accept(token_symbols); returns token or nil
+              def #{name}?
+                if @token&.symbol == #{token}
+                  #{SHIFT_TOKEN}
+                  #{char}
+                elsif !@token && @str[@pos] == #{char}
+                  @pos += 1
+                  #{char}
+                end
+              end
+
+              # like match(token_symbols); returns token or raises parse_error
+              def #{match_name}
+                if @token&.symbol == #{token}
+                  #{SHIFT_TOKEN}
+                  #{char}
+                elsif !@token && @str[@pos] == #{char}
+                  @pos += 1
+                  #{char}
+                else
+                  parse_error("unexpected %s (expected %p)",
+                              @token&.symbol || @str[@pos].inspect, #{char})
+                end
+              end
+            RUBY
           end
-          begin
-            @lex_state, original_lex_state = lex_state, @lex_state
-            token = lookahead
-            unless args.include?(token.symbol)
-              parse_error('unexpected token %s (expected %s)',
-                          token.symbol.id2name,
-                          args.collect {|i| i.id2name}.join(" or "))
-            end
-            shift_token
-            return token
-          ensure
-            @lex_state = original_lex_state
+
+        end
+
+        private
+
+        def match(*args)
+          token = lookahead
+          unless args.include?(token.symbol)
+            parse_error('unexpected token %s (expected %s)',
+                        token.symbol.id2name,
+                        args.collect {|i| i.id2name}.join(" or "))
           end
+          shift_token
+          token
         end
 
         # like match, but does not raise error on failure.
@@ -42,6 +74,14 @@ def accept(*args)
           end
         end
 
+        # To be used conditionally:
+        #   assert_no_lookahead if Net::IMAP.debug
+        def assert_no_lookahead
+          @token.nil? or
+            parse_error("assertion failed: expected @token.nil?, actual %s: %p",
+                        @token.symbol, @token.value)
+        end
+
         # like accept, without consuming the token
         def lookahead?(*symbols)
           @token if symbols.include?((@token ||= next_token)&.symbol)
@@ -51,6 +91,22 @@ def lookahead
           @token ||= next_token
         end
 
+        def accept_re(re)
+          assert_no_lookahead if Net::IMAP.debug
+          re.match(@str, @pos) and @pos = $~.end(0)
+          $~
+        end
+
+        def match_re(re, name)
+          assert_no_lookahead if Net::IMAP.debug
+          if re.match(@str, @pos)
+            @pos = $~.end(0)
+            $~
+          else
+            parse_error("invalid #{name}")
+          end
+        end
+
         def shift_token
           @token = nil
         end