Rewrite command-line splitting to tokenize quoted strings

stecman · stecman · commit 2488ee1f5e0b · 2018-02-13T01:56:09.000+13:00
This replaces the previous simple "split on breaks" regex with a function that uses a more complete regex and returns token shapes. This vastly improves the readability of CompletionContext::splitCommand(). Partially resolves #67 (this doesn't support output of quoted strings yet).
diff --git a/src/CompletionContext.php b/src/CompletionContext.php
@@ -61,7 +61,7 @@ class CompletionContext
      *
      * @var string
      */
-    protected $wordBreaks = "'\"()= \t\n";
+    protected $wordBreaks = "= \t\n";
 
     /**
      * Set the whole contents of the command line as a string
@@ -178,12 +178,15 @@ public function setCharIndex($index)
      * This defaults to a sane value based on BASH's word break characters and shouldn't
      * need to be changed unless your completions contain the default word break characters.
      *
+     * @deprecated This is becoming an internal setting that doesn't make sense to expose publicly.
+     *
      * @see wordBreaks
      * @param string $charList - a single string containing all of the characters to break words on
      */
     public function setWordBreaks($charList)
     {
-        $this->wordBreaks = $charList;
+        // Quotes are never word breaks: quoted strings are tokenized separately, so strip them here
+        $this->wordBreaks = str_replace(array('"', '\''), '', $charList);
         $this->reset();
     }
 
@@ -194,55 +197,136 @@ public function setWordBreaks($charList)
      */
     protected function splitCommand()
     {
-        $this->words = array();
-        $this->wordIndex = null;
-        $cursor = 0;
-
-        $breaks = preg_quote($this->wordBreaks);
-
-        if (!preg_match_all("/([^$breaks]*)([$breaks]*)/", $this->commandLine, $matches)) {
-            return;
-        }
-
-        // Groups:
-        // 1: Word
-        // 2: Break characters
-        foreach ($matches[0] as $index => $wholeMatch) {
-            // Determine which word the cursor is in
-            $cursor += strlen($wholeMatch);
-            $word = $matches[1][$index];
-            $breaks = $matches[2][$index];
-
-            if ($this->wordIndex === null && $cursor >= $this->charIndex) {
-                $this->wordIndex = $index;
+        $tokens = $this->tokenizeString($this->commandLine);
 
-                // Find the user's cursor position relative to the end of this word
-                // The end of the word is the internal cursor minus any break characters that were captured
-                $cursorWordOffset = $this->charIndex - ($cursor - strlen($breaks));
+        foreach ($tokens as $token) {
+            if ($token['type'] !== 'break') {
+                $this->words[] = $this->getTokenValue($token);
+            }
 
-                if ($cursorWordOffset < 0) {
-                    // Cursor is inside the word - truncate the word at the cursor
-                    // (This emulates normal BASH completion behaviour I've observed, though I'm not entirely sure if it's useful)
-                    $word = substr($word, 0, strlen($word) + $cursorWordOffset);
+            // Determine which word index the cursor is inside once we reach its offset
+            if ($this->wordIndex === null && $this->charIndex <= $token['offsetEnd']) {
+                $this->wordIndex = count($this->words) - 1;
 
-                } elseif ($cursorWordOffset > 0) {
+                if ($token['type'] === 'break') {
                     // Cursor is in the break-space after a word
                     // Push an empty word at the cursor to allow completion of new terms at the cursor, ignoring words ahead
                     $this->wordIndex++;
-                    $this->words[] = $word;
                     $this->words[] = '';
                     continue;
                 }
-            }
 
-            if ($word !== '') {
-                $this->words[] = $word;
+                if ($this->charIndex < $token['offsetEnd']) {
+                    // Cursor is inside the current word - cut the raw token at the cursor, then unquote and
+                    // unescape the remainder so the partial word is stored in the same form as complete words
+                    $partial = $token;
+                    $partial['value'] = substr($token['value'], 0, $this->charIndex - $token['offset']);
+
+                    $this->words[$this->wordIndex] = $this->getTokenValue($partial);
+                }
             }
         }
 
-        if ($this->wordIndex > count($this->words) - 1) {
-            $this->wordIndex = count($this->words) - 1;
+        // Cursor position is past the end of the command line string - consider it a new word
+        if ($this->wordIndex === null) {
+            $this->wordIndex = count($this->words);
+            $this->words[] = '';
+        }
+    }
+
+    /**
+     * Return a token's value with its surrounding quotes and backslash escapes removed
+     *
+     * @see self::tokenizeString()
+     * @param array $token - a token as built by tokenizeString(); only 'type' and 'value' are read
+     * @return string
+     */
+    protected function getTokenValue($token)
+    {
+        $value = $token['value'];
+
+        // Drop the opening quote, and the closing one only when it is the same quote character, unescaped
+        if ($token['type'] === 'quoted') {
+            $value = preg_replace('/^([\'"])((?:\\\\.|(?!\1)[^\\\\])*)\1?(.*)$/s', '$2$3', $value);
         }
+
+        // Remove escape characters
+        $value = preg_replace('/\\\\(.)/', '$1', $value);
+
+        return $value;
+    }
+
+    /**
+     * Break a string into words, quoted strings and non-words (breaks)
+     *
+     * Returns an array of unmodified segments of $string with offset and type information.
+     *
+     * @param string $string
+     * @return array as [ [type => string, value => string, offset => int, offsetEnd => int], ... ]
+     */
+    protected function tokenizeString($string)
+    {
+        // Map capture groups to returned token type
+        $typeMap = array(
+            'double_quote_string' => 'quoted',
+            'single_quote_string' => 'quoted',
+            'word' => 'word',
+            'break' => 'break',
+        );
+
+        // Backslash-escape each word break character for use inside a character class. Letters and digits
+        // are left bare, since a backslash would turn them into escape sequences (e.g. "\d" or "\t")
+        $breaks = preg_replace('/([^a-zA-Z0-9])/', '\\\$1', $this->wordBreaks);
+
+        $pattern = <<<"REGEX"
+            /(?:
+                (?P<double_quote_string>
+                    "(\\\\.|[^\"\\\\])*(?:"|$)
+                ) |
+                (?P<single_quote_string>
+                    '(\\\\.|[^'\\\\])*(?:'|$)
+                ) |
+                (?P<word>
+                    (?:\\\\.|[^$breaks])+
+                ) |
+                (?P<break>
+                     [$breaks]+
+                )
+            )/x
+REGEX;
+
+        $tokens = array();
+
+        if (!preg_match_all($pattern, $string, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER)) {
+            return $tokens;
+        }
+
+        foreach ($matches as $set) {
+            foreach ($set as $groupName => $match) {
+
+                // Ignore integer keys: they duplicate the named groups or belong to unnamed inner groups
+                if (is_int($groupName)) {
+                    continue;
+                }
+
+                // Skip if the offset indicates this group didn't match
+                if ($match[1] === -1) {
+                    continue;
+                }
+
+                $tokens[] = array(
+                    'type' => $typeMap[$groupName],
+                    'value' => $match[0],
+                    'offset' => $match[1],
+                    'offsetEnd' => $match[1] + strlen($match[0])
+                );
+
+                // Move to the next set (only one group should match per set)
+                break;
+            }
+        }
+
+        return $tokens;
     }
 
     /**
diff --git a/tests/Stecman/Component/Symfony/Console/BashCompletion/CompletionContextTest.php b/tests/Stecman/Component/Symfony/Console/BashCompletion/CompletionContextTest.php
@@ -92,6 +92,44 @@ public function testWordBreakingWithSmallInputs()
         $this->assertEquals('', $context->getCurrentWord());
     }
 
+    public function testQuotedStringWordBreaking()
+    {
+        // The cursor index is set far beyond the command line used here,
+        // so the expected word list ends with an empty word
+        $pastEndContext = new CompletionContext();
+        $pastEndContext->setCharIndex(1000);
+        $pastEndContext->setCommandLine('make horse --legs=3 --name="Jeff the horse" --colour Extreme\ Blanc \'foo " bar\'');
+
+        // Quoted strings and escaped spaces are expected as single words,
+        // with the surrounding quotes and the escaping backslash removed
+        $expectedWords = array(
+            'make',
+            'horse',
+            '--legs',
+            '3',
+            '--name',
+            'Jeff the horse',
+            '--colour',
+            'Extreme Blanc',
+            'foo " bar',
+            '',
+        );
+        $this->assertEquals($expectedWords, $pastEndContext->getWords());
+
+        // Cursor after the equals symbol of an option argument:
+        // an empty word is expected for the value
+        $equalsContext = new CompletionContext();
+        $equalsContext->setCommandLine('console --tag=');
+        $equalsContext->setCharIndex(14);
+
+        $expectedAfterEquals = array(
+            'console',
+            '--tag',
+            ''
+        );
+        $this->assertEquals($expectedAfterEquals, $equalsContext->getWords());
+    }
+
     public function testConfigureFromEnvironment()
     {
         putenv("CMDLINE_CONTENTS=beam up li");
diff --git a/tests/Stecman/Component/Symfony/Console/BashCompletion/CompletionHandlerTest.php b/tests/Stecman/Component/Symfony/Console/BashCompletion/CompletionHandlerTest.php
@@ -80,6 +80,21 @@ public function testCompleteOptionFull()
         $this->assertArraySubset(array('--jazz-hands'), $this->getTerms($handler->runCompletion()));
     }
 
+    public function testCompleteOptionEqualsValue()
+    {
+        $prefix = 'app completion-aware --option-with-suggestions=';
+        $allSuggestions = array('one-opt', 'two-opt');
+
+        // Command line ends at the "=" sign
+        $this->assertEquals($allSuggestions, $this->getTerms($this->createHandler($prefix)->runCompletion()));
+
+        // Command line ends at an opening quote
+        $this->assertEquals($allSuggestions, $this->getTerms($this->createHandler($prefix . '"')->runCompletion()));
+
+        // Command line ends inside an unclosed quote that already has a partial value
+        $this->assertEquals(array('two-opt'), $this->getTerms($this->createHandler($prefix . '"two')->runCompletion()));
+    }
+
     public function testCompleteOptionOrder()
     {
         // Completion of options should be able to happen anywhere after the command name