Fixes for ;; and spaces in array concatenation (#7)

c42f · web-flow · commit e6fd15df62bb · 2022-02-16T17:34:12.000+10:00
* Detect whether array concatenation is column-major or row-major in
  the first and second dimensions. Report errors for mixed
  orderings.
* Treat ;; as line continuation when used in hcat
* Treat newlines as insignificant when mixed with semicolons as
  separators.
diff --git a/src/green_tree.jl b/src/green_tree.jl
@@ -79,7 +79,7 @@ function _show_green_node(io, node, indent, pos, str, show_trivia)
     if is_leaf
         line = string(posstr, indent, summary(node))
     else
-        line = string(posstr, indent, '[', summary(node), "]")
+        line = string(posstr, indent, '[', summary(node), ']')
     end
     if !is_trivia(node) && is_leaf
         line = rpad(line, 40) * "✔"
diff --git a/src/parse_stream.jl b/src/parse_stream.jl
@@ -51,11 +51,11 @@ numeric_flags(head::SyntaxHead) = numeric_flags(flags(head))
 is_error(head::SyntaxHead)  = kind(head) == K"error"
 
 function Base.summary(head::SyntaxHead)
-    _kind_str(kind(head))
+    untokenize(head, unique=false, include_flag_suff=false)
 end
 
-function untokenize(head::SyntaxHead; include_flag_suff=true)
-    str = untokenize(kind(head))
+function untokenize(head::SyntaxHead; unique=true, include_flag_suff=true)
+    str = untokenize(kind(head); unique=unique)
     if is_dotted(head)
         str = "."*str
     end
diff --git a/src/parser.jl b/src/parser.jl
@@ -2572,10 +2572,6 @@ end
 # Mismatched rows
 # [x y ; z]     ==>  (vcat (row x y) z)
 #
-# Double semicolon with spaces allowed (only) for line continuation
-#v1.7: [x y ;;\n z w]  ==>  (hcat x y z w)
-#v1.7: [x y ;; z w]    ==>  (hcat x y (error) z w)
-#
 # Single elements in rows
 #v1.7: [x ; y ;; z ]  ==>  (ncat-2 (nrow-1 x y) z)
 #v1.7: [x  y ;;; z ]  ==>  (ncat-3 (row x y) z)
@@ -2592,6 +2588,7 @@ end
 function parse_array(ps::ParseState, mark, closer, end_is_symbol)
     ps = ParseState(ps, end_symbol=end_is_symbol)
 
+    array_order = Ref(:unknown)
     # Outer array parsing loop - parse chain of separators with descending
     # precedence such as
     #v1.7: [a ; b ;; c ;;; d ;;;; e] ==> (ncat-4 (ncat-3 (ncat-2 (ncat-1 a b) c) d) e)
@@ -2604,9 +2601,9 @@ function parse_array(ps::ParseState, mark, closer, end_is_symbol)
     #
     # For an excellent overview of Pratt parsing, see
     # https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html
-    (dim, binding_power) = parse_array_separator(ps)
+    (dim, binding_power) = parse_array_separator(ps, array_order)
     while true
-        (next_dim, next_bp) = parse_array_inner(ps, binding_power)
+        (next_dim, next_bp) = parse_array_inner(ps, binding_power, array_order)
         if next_bp == typemin(Int)
             break
         end
@@ -2624,20 +2621,20 @@ function parse_array(ps::ParseState, mark, closer, end_is_symbol)
            (K"ncat", set_numeric_flags(dim))
 end
 
-# Parse equal and ascending precedence chains of array concatenation operators
-# (semicolons, newlines and whitespace). Invariants:
+# Parse equal and ascending precedence chains of array concatenation operators -
+# semicolons, newlines and whitespace. Invariants:
 #
 # * The caller must have already consumed
 #   - The left hand side
-#   - The concatenation operator, providing the current binding_power.
-#   So eg, we're here in the input stream
+#   - The concatenation operator, providing `binding_power`.
+#   So eg, we're here in the input stream, either at an element or closing token
 #                |
 #          [a ;; b ; c ]
 #          [a ;; ]
 #
 # * The caller must call emit() to delimit the AST node for this binding power.
 #
-function parse_array_inner(ps, binding_power)
+function parse_array_inner(ps, binding_power, array_order)
     mark = NO_POSITION
     dim = -1
     bp = binding_power
@@ -2655,13 +2652,13 @@ function parse_array_inner(ps, binding_power)
             # Parse one expression
             mark = position(ps)
             parse_eq_star(ps)
-            (next_dim, next_bp) = parse_array_separator(ps)
+            (next_dim, next_bp) = parse_array_separator(ps, array_order)
         else # bp > binding_power
             # Recurse to parse a separator with greater binding power. Eg:
             # [a ;; b ; c ]
             #       |   ^------ the next input is here
             #       '---------- the mark is here
-            (next_dim, next_bp) = parse_array_inner(ps, bp)
+            (next_dim, next_bp) = parse_array_inner(ps, bp, array_order)
             if bp == 0
                 emit(ps, mark, K"row")
             else
@@ -2674,46 +2671,83 @@ end
 
 # Parse a separator in an array concatenation
 #
-# Here we aim to identify:
+# Here we return a tuple (dim, binding_power) containing
 # * Dimension on which the next separator acts
 # * Binding power (precedence) of the separator, where whitespace binds
 #   tightest:  ... < `;;;` < `;;` < `;`,`\n` < whitespace. We choose binding
 #   power of 0 for whitespace and negative numbers for other separators.
 #
 # FIXME: Error messages for mixed spaces and ;; delimiters
-function parse_array_separator(ps; skip_newlines=false)
-    t = peek_token(ps; skip_newlines=skip_newlines)
-    k = kind(t)
-    if k == K";"
+function parse_array_separator(ps, array_order)
+    sep_mismatch_err = "cannot mix space and ;; separators in an array expression, except to wrap a line"
+    mark = position(ps)
+    t = peek_token(ps, skip_newlines=true)
+    if kind(t) == K";"
+        # Newlines before semicolons are not significant
+        # [a \n ;]     ==> (vcat a)
+        bump_trivia(ps)
         n_semis = 1
         while true
-            bump(ps, TRIVIA_FLAG; skip_newlines=skip_newlines)
+            bump(ps, TRIVIA_FLAG)
             t = peek_token(ps)
-            if kind(t) != K";" || t.had_whitespace
+            if kind(t) != K";"
                 break
             end
+            if t.had_whitespace
+                bump_disallowed_space(ps)
+            end
             n_semis += 1
         end
-        # FIXME - following is ncat, not line continuation
-        # [a ;; \n c]
-        if n_semis == 2 && peek(ps) == K"NewlineWs"
-            # Line continuation
-            # [a b ;; \n \n c]
-            while peek(ps) == K"NewlineWs"
-                bump(ps, TRIVIA_FLAG)
+        had_newline = peek(ps) == K"NewlineWs"
+        # Newlines after semicolons are not significant
+        # [a ; \n]     ==> (vcat a)
+        # [a ; \n\n b] ==> (vcat a b)
+        #v1.7: [a ;; \n b]  ==> (ncat-2 a b)
+        bump_trivia(ps)
+        if n_semis == 2
+            if array_order[] === :row_major
+                if had_newline
+                    # In hcat with spaces as separators, `;;` is a line
+                    # continuation character
+                    #v1.7: [a b ;; \n c]  ==>  (hcat a b c)
+                    #v1.7: [a b \n ;; c]  ==>  (ncat-2 (row a b (error-t)) c)
+                    return (2, 0)
+                else
+                    # Can't mix spaces and multiple ;;
+                    #v1.7:  [a b ;; c]  ==>  (ncat-2 (row a b (error-t)) c)
+                    emit(ps, mark, K"error", TRIVIA_FLAG, error=sep_mismatch_err)
+                end
+            else
+                array_order[] = :column_major
             end
-            return (2, 0)
-        else
-            return (n_semis, -n_semis)
         end
-    elseif k == K"NewlineWs"
+        return (n_semis, -n_semis)
+    end
+    t = peek_token(ps)
+    k = kind(t)
+    if k == K"NewlineWs"
         bump_trivia(ps)
-        # Newlines separate the first dimension
+        # Treat a linebreak prior to a value as a semicolon (ie, separator for
+        # the first dimension) if no previous semicolons observed
+        # [a \n b]  ==> (vcat a b)
+        return (1, -1)
+    elseif k == K","
+        # Treat `,` as semicolon for the purposes of recovery
+        # [a; b, c] ==> (vcat a b (error-t) c)
+        bump(ps, TRIVIA_FLAG, error="unexpected comma in array expression")
         return (1, -1)
     else
         if t.had_whitespace && !is_closing_token(ps, k)
+            if array_order[] === :column_major
+                # Can't mix multiple ;'s and spaces
+                #v1.7:  [a ;; b c]  ==>  (ncat-2 a (row b (error-t) c))
+                bump_trivia(ps, TRIVIA_FLAG, error=sep_mismatch_err)
+            else
+                array_order[] = :row_major
+            end
             return (2, 0)
         else
+            # Something else; use typemin to exit array parsing
             return (typemin(Int), typemin(Int))
         end
     end
@@ -2739,10 +2773,11 @@ function parse_cat(ps::ParseState, closer, end_is_symbol)
         #v1.8: [;;]          ==>  (ncat-2)
         #v1.8: [\n  ;; \n ]  ==>  (ncat-2)
         #v1.7: [;;]          ==>  (ncat-2 (error))
-        n_semis, _ = parse_array_separator(ps; skip_newlines=true)
+        bump_trivia(ps)
+        dim, _ = parse_array_separator(ps, Ref(:unknown))
         min_supported_version(v"1.8", ps, mark, "empty multidimensional array syntax")
         bump_closing_token(ps, closer)
-        return (K"ncat", set_numeric_flags(n_semis))
+        return (K"ncat", set_numeric_flags(dim))
     end
     parse_eq_star(ps)
     k = peek(ps, skip_newlines=true)
diff --git a/src/tokens.jl b/src/tokens.jl
@@ -113,15 +113,17 @@ function is_whitespace(t)
     kind(t) in (K"Whitespace", K"NewlineWs")
 end
 
-function _kind_str(k::Kind)
-    _kind_to_str[k]
-end
-
 """
 Return the string representation of a token kind, or `nothing` if the kind
 represents a class of tokens like K"Identifier".
+
+When `unique=true` only return a string when the kind uniquely defines the
+corresponding input token, otherwise return `nothing`.  When `unique=false`,
+return the name of the kind.
+
+TODO: Replace `untokenize()` with `Base.string()`?
 """
-function untokenize(k::Kind)
-    get(_kind_to_str_unique, k, nothing)
+function untokenize(k::Kind; unique=true)
+    get(unique ? _kind_to_str_unique : _kind_to_str, k, nothing)
 end
 
diff --git a/test/parser.jl b/test/parser.jl
@@ -625,9 +625,6 @@ tests = [
         "[x y]"  =>  "(hcat x y)"
         # Mismatched rows
         "[x y ; z]"  =>  "(vcat (row x y) z)"
-        # Double semicolon with spaces allowed (only) for line continuation
-        "[x y ;;\n z w]"  =>  "(hcat x y z w)"
-        # "[x y ;; z w]"  =>  "(hcat x y (error) z w)" # FIXME
         # Single elements in rows
         ((v=v"1.7",), "[x ; y ;; z ]")  =>  "(ncat-2 (nrow-1 x y) z)"
         ((v=v"1.7",), "[x  y ;;; z ]")  =>  "(ncat-3 (row x y) z)"
@@ -638,6 +635,24 @@ tests = [
         # Column major
         ((v=v"1.7",), "[x ; y ;; z ; w ;;; a ; b ;; c ; d]")  =>
             "(ncat-3 (nrow-2 (nrow-1 x y) (nrow-1 z w)) (nrow-2 (nrow-1 a b) (nrow-1 c d)))"
+        # Array separators
+        # Newlines before semicolons are not significant
+        "[a \n ;]"  =>  "(vcat a)"
+        # Newlines after semicolons are not significant
+        "[a ; \n]"  =>  "(vcat a)"
+        "[a ; \n\n b]"  =>  "(vcat a b)"
+        ((v=v"1.7",), "[a ;; \n b]")  =>  "(ncat-2 a b)"
+        # In hcat with spaces as separators, `;;` is a line
+        # continuation character
+        ((v=v"1.7",), "[a b ;; \n c]")  =>  "(hcat a b c)"
+        ((v=v"1.7",), "[a b \n ;; c]")  =>  "(ncat-2 (row a b (error-t)) c)"
+        # Can't mix spaces and multiple ;'s
+        ((v=v"1.7",), "[a b ;; c]")  =>  "(ncat-2 (row a b (error-t)) c)"
+        # Treat a linebreak prior to a value as a semicolon (ie, separator for
+        # the first dimension) if no previous semicolons observed
+        "[a \n b]"  =>  "(vcat a b)"
+        # Can't mix multiple ;'s and spaces
+        ((v=v"1.7",), "[a ;; b c]")  =>  "(ncat-2 a (row b (error-t) c))"
         # Empty nd arrays
         ((v=v"1.8",), "[;]")   =>  "(ncat-1)"
         ((v=v"1.8",), "[;;]")  =>  "(ncat-2)"