Skip to content

Commit d3e54db

Browse files
authored
Merge pull request #407 from michaelpj/mpj/code-point-code-unit
Add functions to convert between code-point and code-unit positions
2 parents 8b63438 + d1f6ef2 commit d3e54db

File tree

4 files changed

+185
-3
lines changed

4 files changed

+185
-3
lines changed

lsp-types/src/Language/LSP/Types/Location.hs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ import Language.LSP.Types.Utils
1111

1212
-- ---------------------------------------------------------------------
1313

14+
-- | A position in a document. Note that the character offsets in a line
15+
-- are given in UTF-16 code units, *not* Unicode code points.
1416
data Position =
1517
Position
1618
{ -- | Line position in a document (zero-based).

lsp/example/Reactor.hs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -282,15 +282,15 @@ handle logger = mconcat
282282
doc = params ^. J.textDocument
283283
(J.List diags) = params ^. J.context . J.diagnostics
284284
-- makeCommand only generates commands for diagnostics whose source is us
285-
makeCommand (J.Diagnostic (J.Range start _) _s _c (Just "lsp-hello") _m _t _l) = [J.Command title cmd cmdparams]
285+
makeCommand (J.Diagnostic (J.Range s _) _s _c (Just "lsp-hello") _m _t _l) = [J.Command title cmd cmdparams]
286286
where
287287
title = "Apply LSP hello command:" <> head (T.lines _m)
288288
-- NOTE: the cmd needs to be registered via the InitializeResponse message. See lspOptions above
289289
cmd = "lsp-hello-command"
290290
-- need 'file' and 'start_pos'
291291
args = J.List
292292
[ J.object [("file", J.object [("textDocument",J.toJSON doc)])]
293-
, J.object [("start_pos",J.object [("position", J.toJSON start)])]
293+
, J.object [("start_pos",J.object [("position", J.toJSON s)])]
294294
]
295295
cmdparams = Just args
296296
makeCommand (J.Diagnostic _r _s _c _source _m _t _l) = []

lsp/src/Language/LSP/VFS.hs

Lines changed: 141 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,18 @@ module Language.LSP.VFS
4444
, persistFileVFS
4545
, closeVFS
4646

47+
-- * Positions and transformations
48+
, CodePointPosition (..)
49+
, line
50+
, character
51+
, codePointPositionToPosition
52+
, positionToCodePointPosition
53+
, CodePointRange (..)
54+
, start
55+
, end
56+
, codePointRangeToRange
57+
, rangeToCodePointRange
58+
4759
-- * manipulating the file contents
4860
, rangeLinesFromVfs
4961
, PosPrefixInfo(..)
@@ -69,9 +81,10 @@ import Data.Ord
6981
import qualified Data.HashMap.Strict as HashMap
7082
import qualified Data.Map.Strict as Map
7183
import Data.Maybe
84+
import qualified Data.Text.Rope as URope
7285
import Data.Text.Utf16.Rope ( Rope )
7386
import qualified Data.Text.Utf16.Rope as Rope
74-
import Data.Text.Prettyprint.Doc
87+
import Data.Text.Prettyprint.Doc hiding (line)
7588
import qualified Language.LSP.Types as J
7689
import qualified Language.LSP.Types.Lens as J
7790
import System.FilePath
@@ -346,6 +359,133 @@ changeChars logger str start finish new = do
346359

347360
-- ---------------------------------------------------------------------
348361

362+
-- | A position, like a 'J.Position', but where the offsets in the line are measured in
363+
-- Unicode code points instead of UTF-16 code units.
364+
data CodePointPosition =
365+
CodePointPosition
366+
{ -- | Line position in a document (zero-based).
367+
_line :: J.UInt
368+
-- | Character offset on a line in a document in *code points* (zero-based).
369+
, _character :: J.UInt
370+
} deriving (Show, Read, Eq, Ord)
371+
372+
-- | A range, like a 'J.Range', but where the offsets in the line are measured in
373+
-- Unicode code points instead of UTF-16 code units.
374+
data CodePointRange =
375+
CodePointRange
376+
{ _start :: CodePointPosition -- ^ The range's start position.
377+
, _end :: CodePointPosition -- ^ The range's end position.
378+
} deriving (Show, Read, Eq, Ord)
379+
380+
makeFieldsNoPrefix ''CodePointPosition
381+
makeFieldsNoPrefix ''CodePointRange
382+
383+
{- Note [Converting between code points and code units]
384+
This is inherently a somewhat expensive operation, but we take some care to minimize the cost.
385+
In particular, we use the good asymptotics of 'Rope' to our advantage:
386+
- We extract the single line that we are interested in in time logarithmic in the number of lines.
387+
- We then split the line at the given position, and check how long the prefix is, which takes
388+
linear time in the length of the (single) line.
389+
390+
We also may need to convert the line back and forth between ropes with different indexing. Again
391+
this is linear time in the length of the line.
392+
393+
So the overall process is logarithmic in the number of lines, and linear in the length of the specific
394+
line. Which is okay-ish, so long as we don't have very long lines.
395+
-}
396+
397+
-- | Extracts a specific line from a 'Rope.Rope'.
398+
-- Logarithmic in the number of lines.
399+
extractLine :: Rope.Rope -> Word -> Maybe Rope.Rope
400+
extractLine rope l = do
401+
-- Check for the line being out of bounds
402+
let lastLine = Rope.posLine $ Rope.lengthAsPosition rope
403+
guard $ l <= lastLine
404+
405+
let (_, suffix) = Rope.splitAtLine l rope
406+
(prefix, _) = Rope.splitAtLine 1 suffix
407+
pure prefix
408+
409+
-- | Translate a code-point offset into a code-unit offset.
410+
-- Linear in the length of the rope.
411+
codePointOffsetToCodeUnitOffset :: URope.Rope -> Word -> Maybe Word
412+
codePointOffsetToCodeUnitOffset rope offset = do
413+
-- Check for the position being out of bounds
414+
guard $ offset <= URope.length rope
415+
-- Split at the given position in *code points*
416+
let (prefix, _) = URope.splitAt offset rope
417+
-- Convert the prefix to a rope using *code units*
418+
utf16Prefix = Rope.fromText $ URope.toText prefix
419+
-- Get the length of the prefix in *code units*
420+
pure $ Rope.length utf16Prefix
421+
422+
-- | Translate a UTF-16 code-unit offset into a code-point offset.
423+
-- Linear in the length of the rope.
424+
codeUnitOffsetToCodePointOffset :: Rope.Rope -> Word -> Maybe Word
425+
codeUnitOffsetToCodePointOffset rope offset = do
426+
-- Check for the position being out of bounds
427+
guard $ offset <= Rope.length rope
428+
-- Split at the given position in *code units*
429+
(prefix, _) <- Rope.splitAt offset rope
430+
-- Convert the prefixto a rope using *code points*
431+
let utfPrefix = URope.fromText $ Rope.toText prefix
432+
-- Get the length of the prefix in *code points*
433+
pure $ URope.length utfPrefix
434+
435+
-- | Given a virtual file, translate a 'CodePointPosition' in that file into a 'J.Position' in that file.
436+
--
437+
-- Will return 'Nothing' if the requested position is out of bounds of the document.
438+
--
439+
-- Logarithmic in the number of lines in the document, and linear in the length of the line containing
440+
-- the position.
441+
codePointPositionToPosition :: VirtualFile -> CodePointPosition -> Maybe J.Position
442+
codePointPositionToPosition vFile (CodePointPosition l cpc) = do
443+
-- See Note [Converting between code points and code units]
444+
let text = _file_text vFile
445+
utf16Line <- extractLine text (fromIntegral l)
446+
-- Convert the line a rope using *code points*
447+
let utfLine = URope.fromText $ Rope.toText utf16Line
448+
449+
cuc <- codePointOffsetToCodeUnitOffset utfLine (fromIntegral cpc)
450+
pure $ J.Position l (fromIntegral cuc)
451+
452+
-- | Given a virtual file, translate a 'CodePointRange' in that file into a 'J.Range' in that file.
453+
--
454+
-- Will return 'Nothing' if any of the positions are out of bounds of the document.
455+
--
456+
-- Logarithmic in the number of lines in the document, and linear in the length of the lines containing
457+
-- the positions.
458+
codePointRangeToRange :: VirtualFile -> CodePointRange -> Maybe J.Range
459+
codePointRangeToRange vFile (CodePointRange b e) =
460+
J.Range <$> codePointPositionToPosition vFile b <*> codePointPositionToPosition vFile e
461+
462+
-- | Given a virtual file, translate a 'J.Position' in that file into a 'CodePointPosition' in that file.
463+
--
464+
-- Will return 'Nothing' if the requested position lies inside a code point, or if it is out of bounds of the document.
465+
--
466+
-- Logarithmic in the number of lines in the document, and linear in the length of the line containing
467+
-- the position.
468+
positionToCodePointPosition :: VirtualFile -> J.Position -> Maybe CodePointPosition
469+
positionToCodePointPosition vFile (J.Position l cuc) = do
470+
-- See Note [Converting between code points and code units]
471+
let text = _file_text vFile
472+
utf16Line <- extractLine text (fromIntegral l)
473+
474+
cpc <- codeUnitOffsetToCodePointOffset utf16Line (fromIntegral cuc)
475+
pure $ CodePointPosition l (fromIntegral cpc)
476+
477+
-- | Given a virtual file, translate a 'J.Range' in that file into a 'CodePointRange' in that file.
478+
--
479+
-- Will return 'Nothing' if any of the positions are out of bounds of the document.
480+
--
481+
-- Logarithmic in the number of lines in the document, and linear in the length of the lines containing
482+
-- the positions.
483+
rangeToCodePointRange :: VirtualFile -> J.Range -> Maybe CodePointRange
484+
rangeToCodePointRange vFile (J.Range b e) =
485+
CodePointRange <$> positionToCodePointPosition vFile b <*> positionToCodePointPosition vFile e
486+
487+
-- ---------------------------------------------------------------------
488+
349489
-- TODO:AZ:move this to somewhere sane
350490
-- | Describes the line at the current cursor position
351491
data PosPrefixInfo = PosPrefixInfo

lsp/test/VspSpec.hs

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,46 @@ vspSpec = do
299299
, " putStrLn \"hello world\""
300300
]
301301

302+
it "converts code units to code points" $ do
303+
let
304+
orig = unlines
305+
[ "a𐐀b"
306+
, "a𐐀b"
307+
]
308+
vfile = VirtualFile 0 0 (fromString orig)
309+
310+
positionToCodePointPosition vfile (J.Position 1 0) `shouldBe` Just (CodePointPosition 1 0)
311+
positionToCodePointPosition vfile (J.Position 1 1) `shouldBe` Just (CodePointPosition 1 1)
312+
-- Split inside code point
313+
positionToCodePointPosition vfile (J.Position 1 2) `shouldBe` Nothing
314+
positionToCodePointPosition vfile (J.Position 1 3) `shouldBe` Just (CodePointPosition 1 2)
315+
positionToCodePointPosition vfile (J.Position 1 4) `shouldBe` Just (CodePointPosition 1 3)
316+
positionToCodePointPosition vfile (J.Position 1 5) `shouldBe` Just (CodePointPosition 1 4)
317+
-- Greater column than max column
318+
positionToCodePointPosition vfile (J.Position 1 6) `shouldBe` Nothing
319+
positionToCodePointPosition vfile (J.Position 2 1) `shouldBe` Nothing
320+
-- Greater line than max line
321+
positionToCodePointPosition vfile (J.Position 3 0) `shouldBe` Nothing
322+
323+
it "converts code points to code units" $ do
324+
let
325+
orig = unlines
326+
[ "a𐐀b"
327+
, "a𐐀b"
328+
]
329+
vfile = VirtualFile 0 0 (fromString orig)
330+
331+
codePointPositionToPosition vfile (CodePointPosition 1 0) `shouldBe` Just (J.Position 1 0)
332+
codePointPositionToPosition vfile (CodePointPosition 1 1) `shouldBe` Just (J.Position 1 1)
333+
codePointPositionToPosition vfile (CodePointPosition 1 2) `shouldBe` Just (J.Position 1 3)
334+
codePointPositionToPosition vfile (CodePointPosition 1 3) `shouldBe` Just (J.Position 1 4)
335+
codePointPositionToPosition vfile (CodePointPosition 1 4) `shouldBe` Just (J.Position 1 5)
336+
-- Greater column than max column
337+
codePointPositionToPosition vfile (CodePointPosition 1 5) `shouldBe` Nothing
338+
codePointPositionToPosition vfile (CodePointPosition 2 1) `shouldBe` Nothing
339+
-- Greater line than max line
340+
codePointPositionToPosition vfile (CodePointPosition 3 0) `shouldBe` Nothing
341+
302342
-- ---------------------------------
303343

304344
it "getCompletionPrefix" $ do

0 commit comments

Comments
 (0)