@@ -61,7 +61,7 @@ class CompletionContext
6161 *
6262 * @var string
6363 */
64- protected $ wordBreaks = "' \" () = \t\n" ;
64+ protected $ wordBreaks = "= \t\n" ;
6565
6666 /**
6767 * Set the whole contents of the command line as a string
@@ -178,12 +178,15 @@ public function setCharIndex($index)
178178 * This defaults to a sane value based on BASH's word break characters and shouldn't
179179 * need to be changed unless your completions contain the default word break characters.
180180 *
181+ * @deprecated This is becoming an internal setting that doesn't make sense to expose publicly.
182+ *
181183 * @see wordBreaks
182184 * @param string $charList - a single string containing all of the characters to break words on
183185 */
public function setWordBreaks($charList)
{
    // Quote characters can no longer act as word breaks: quoted strings are
    // tokenized separately from break characters, so any quotes supplied by
    // the caller are dropped from the list before it is stored.
    $this->wordBreaks = str_replace(array('"', '\''), '', $charList);

    // Invalidate previously computed state so it is rebuilt with the new breaks
    $this->reset();
}
189192
@@ -194,55 +197,136 @@ public function setWordBreaks($charList)
194197 */
protected function splitCommand()
{
    // Start from a clean slate so repeated calls never append to stale results
    $this->words = array();
    $this->wordIndex = null;

    foreach ($this->tokenizeString($this->commandLine) as $token) {
        $isBreak = ($token['type'] == 'break');

        // Every non-break token contributes one word (quotes and escapes removed)
        if (!$isBreak) {
            $this->words[] = $this->getTokenValue($token);
        }

        // Only the first token whose end reaches the cursor decides the word index
        if ($this->wordIndex !== null || $this->charIndex > $token['offsetEnd']) {
            continue;
        }

        $this->wordIndex = count($this->words) - 1;

        if ($isBreak) {
            // Cursor is in the break-space after a word
            // Push an empty word at the cursor to allow completion of new terms at the cursor, ignoring words ahead
            $this->wordIndex++;
            $this->words[] = '';
            continue;
        }

        if ($this->charIndex < $token['offsetEnd']) {
            // Cursor is inside the current word - truncate the word at the cursor.
            // The raw token text is cut first and then passed through getTokenValue()
            // so the truncated word gets the same quote/escape removal as whole words
            // (e.g. `"hello wor` is completed as `hello wor`, not `"hello wor`).
            $truncatedToken = $token;
            $relativeOffset = $this->charIndex - $token['offset'];
            $truncatedToken['value'] = (string) substr($token['value'], 0, $relativeOffset);

            $this->words[$this->wordIndex] = $this->getTokenValue($truncatedToken);
        }
    }

    // Cursor position is past the end of the command line string - consider it a new word
    if ($this->wordIndex === null) {
        $this->wordIndex = count($this->words);
        $this->words[] = '';
    }
}
236+
/**
 * Return a token's value with escaping and quotes removed
 *
 * For tokens of type "quoted" the opening quote is dropped, and the closing
 * quote is dropped only when it is the same character as the opening quote
 * and is not itself backslash-escaped (an unclosed string keeps all of its
 * content). Afterwards every backslash escape (`\x`) is replaced by the
 * escaped character (`x`) for all token types.
 *
 * @see self::tokenizeString()
 * @param array $token - a token with at least the keys "type" and "value"
 * @return string
 */
protected function getTokenValue($token)
{
    $value = $token['value'];

    if ($token['type'] == 'quoted' && $value !== '') {
        $quote = $value[0];

        if ($quote === '"' || $quote === '\'') {
            // Drop the opening quote
            $value = (string) substr($value, 1);

            // Drop the closing quote, but only if it really closes the string:
            // it must match the opening quote and be preceded by an even number
            // of backslashes (an odd number means the quote itself is escaped).
            if (substr($value, -1) === $quote) {
                $beforeQuote = (string) substr($value, 0, -1);
                $trailingSlashes = strlen($beforeQuote) - strlen(rtrim($beforeQuote, '\\'));

                if ($trailingSlashes % 2 === 0) {
                    $value = $beforeQuote;
                }
            }
        }
    }

    // Remove escape characters
    return preg_replace('/\\\\(.)/', '$1', $value);
}
258+
/**
 * Break a string into words, quoted strings and non-words (breaks)
 *
 * Returns an array of unmodified segments of $string with offset and type information.
 * Token types are "quoted" (single or double quoted string, possibly unclosed at the
 * end of the input), "word" and "break" (a run of the configured word break characters).
 *
 * @param string $string
 * @return array as [ [type => string, value => string, offset => int, offsetEnd => int], ... ]
 */
protected function tokenizeString($string)
{
    // Map capture groups to returned token type
    $typeMap = array(
        'double_quote_string' => 'quoted',
        'single_quote_string' => 'quoted',
        'word' => 'word',
        'break' => 'break',
    );

    // Backslash-escape every non-alphanumeric word break character (including whitespace
    // and character class meta-characters such as "]", "^" and "-").
    // preg_quote won't work here as it doesn't understand the ignore whitespace flag ("x").
    // Alphanumerics are deliberately left alone: a backslash in front of them would turn
    // them into PCRE escape sequences (e.g. "\n", "\d", "\x") instead of literal characters.
    $breaks = preg_replace('/[^a-zA-Z0-9]/', '\\\\$0', $this->wordBreaks);

    $pattern = <<<"REGEX"
        /(?:
            (?P<double_quote_string>
                "(\\\\.|[^\"\\\\])*(?:"|$)
            ) |
            (?P<single_quote_string>
                '(\\\\.|[^'\\\\])*(?:'|$)
            ) |
            (?P<word>
                (?:\\\\.|[^{$breaks}])+
            ) |
            (?P<break>
                [{$breaks}]+
            )
        )/x
REGEX;

    $tokens = array();

    if (!preg_match_all($pattern, $string, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER)) {
        return $tokens;
    }

    foreach ($matches as $set) {
        // Look only at the named groups; the integer indices in $set duplicate them
        foreach ($typeMap as $groupName => $type) {
            // A group that did not take part in the match is either absent from the set
            // or reported with an offset of -1
            if (!isset($set[$groupName]) || $set[$groupName][1] === -1) {
                continue;
            }

            list($value, $offset) = $set[$groupName];

            $tokens[] = array(
                'type' => $type,
                'value' => $value,
                'offset' => $offset,
                'offsetEnd' => $offset + strlen($value)
            );

            // The named groups are alternatives, so only one can match per set
            break;
        }
    }

    return $tokens;
}
247331
248332 /**
0 commit comments