Skip to content

Commit 6c605b0

Browse files
committed
Add UTF-16LE_b encoding mimicing base
1 parent 664b776 commit 6c605b0

File tree

2 files changed

+116
-0
lines changed

2 files changed

+116
-0
lines changed

System/OsPath/Encoding.hs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,14 @@ module System.OsPath.Encoding
1212
, ucs2le_decode
1313
, ucs2le_encode
1414

15+
-- * UTF-16LE_b
16+
, utf16le_b
17+
, mkUTF16le_b
18+
, utf16le_b_DF
19+
, utf16le_b_EF
20+
, utf16le_b_decode
21+
, utf16le_b_encode
22+
1523
-- * base encoding
1624
, encodeWithTE
1725
, decodeWithTE

System/OsPath/Encoding/Internal.hs

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
{-# LANGUAGE NoImplicitPrelude
22
, BangPatterns
33
, TypeApplications
4+
, MultiWayIf
45
#-}
56
{-# OPTIONS_GHC -funbox-strict-fields #-}
67

@@ -114,6 +115,113 @@ ucs2le_encode
114115
in
115116
loop ir0 ow0
116117

118+
-- -----------------------------------------------------------------------------
119+
-- UTF-16b
120+
--
121+
122+
-- | Mimics the base encoding for filesystem operations. This should be total on all inputs (word16 byte arrays).
123+
--
124+
-- Note that this has a subtle difference to 'encodeWithBaseWindows'/'decodeWithBaseWindows': it doesn't care for
125+
-- the @0x0000@ end marker and will as such produce different results. Use @takeWhile (/= '\NUL')@ on the input
126+
-- to recover this behavior.
127+
utf16le_b :: TextEncoding
128+
utf16le_b = mkUTF16le_b ErrorOnCodingFailure
129+
130+
mkUTF16le_b :: CodingFailureMode -> TextEncoding
131+
mkUTF16le_b cfm = TextEncoding { textEncodingName = "UTF-16LE_b",
132+
mkTextDecoder = utf16le_b_DF cfm,
133+
mkTextEncoder = utf16le_b_EF cfm }
134+
135+
utf16le_b_DF :: CodingFailureMode -> IO (TextDecoder ())
136+
utf16le_b_DF cfm =
137+
return (BufferCodec {
138+
encode = utf16le_b_decode,
139+
recover = recoverDecode cfm,
140+
close = return (),
141+
getState = return (),
142+
setState = const $ return ()
143+
})
144+
145+
utf16le_b_EF :: CodingFailureMode -> IO (TextEncoder ())
146+
utf16le_b_EF cfm =
147+
return (BufferCodec {
148+
encode = utf16le_b_encode,
149+
recover = recoverEncode cfm,
150+
close = return (),
151+
getState = return (),
152+
setState = const $ return ()
153+
})
154+
155+
156+
utf16le_b_decode :: DecodeBuffer
157+
utf16le_b_decode
158+
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
159+
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
160+
= let
161+
loop !ir !ow
162+
| ow >= os = done OutputUnderflow ir ow
163+
| ir >= iw = done InputUnderflow ir ow
164+
| ir + 1 == iw = done InputUnderflow ir ow
165+
| otherwise = do
166+
c0 <- readWord8Buf iraw ir
167+
c1 <- readWord8Buf iraw (ir+1)
168+
let x1 = fromIntegral c1 `shiftL` 8 + fromIntegral c0
169+
if | iw - ir >= 4 -> do
170+
c2 <- readWord8Buf iraw (ir+2)
171+
c3 <- readWord8Buf iraw (ir+3)
172+
let x2 = fromIntegral c3 `shiftL` 8 + fromIntegral c2
173+
if | 0xd800 <= x1 && x1 <= 0xdbff
174+
, 0xdc00 <= x2 && x2 <= 0xdfff -> do
175+
ow' <- writeCharBuf oraw ow (unsafeChr ((x1 - 0xd800)*0x400 + (x2 - 0xdc00) + 0x10000))
176+
loop (ir+4) ow'
177+
| otherwise -> do
178+
ow' <- writeCharBuf oraw ow (unsafeChr x1)
179+
loop (ir+2) ow'
180+
| iw - ir >= 2 -> do
181+
ow' <- writeCharBuf oraw ow (unsafeChr x1)
182+
loop (ir+2) ow'
183+
| otherwise -> done InputUnderflow ir ow
184+
185+
-- lambda-lifted, to avoid thunks being built in the inner-loop:
186+
done why !ir !ow = return (why,
187+
if ir == iw then input{ bufL=0, bufR=0 }
188+
else input{ bufL=ir },
189+
output{ bufR=ow })
190+
in
191+
loop ir0 ow0
192+
193+
194+
utf16le_b_encode :: EncodeBuffer
195+
utf16le_b_encode
196+
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
197+
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
198+
= let
199+
done why !ir !ow = return (why,
200+
if ir == iw then input{ bufL=0, bufR=0 }
201+
else input{ bufL=ir },
202+
output{ bufR=ow })
203+
loop !ir !ow
204+
| ir >= iw = done InputUnderflow ir ow
205+
| os - ow < 2 = done OutputUnderflow ir ow
206+
| otherwise = do
207+
(c,ir') <- readCharBuf iraw ir
208+
case ord c of
209+
x | x < 0x10000 -> do
210+
writeWord8Buf oraw ow (fromIntegral x)
211+
writeWord8Buf oraw (ow+1) (fromIntegral (x `shiftR` 8))
212+
loop ir' (ow+2)
213+
| otherwise ->
214+
if os - ow < 4 then done OutputUnderflow ir ow else do
215+
let x' = x - 0x10000
216+
w1 = x' `div` 0x400 + 0xd800
217+
w2 = x' `mod` 0x400 + 0xdc00
218+
writeWord8Buf oraw ow (fromIntegral w1)
219+
writeWord8Buf oraw (ow+1) (fromIntegral (w1 `shiftR` 8))
220+
writeWord8Buf oraw (ow+2) (fromIntegral w2)
221+
writeWord8Buf oraw (ow+3) (fromIntegral (w2 `shiftR` 8))
222+
loop ir' (ow+4)
223+
in
224+
loop ir0 ow0
117225

118226
-- -----------------------------------------------------------------------------
119227
-- Windows encoding (ripped off from base)

0 commit comments

Comments
 (0)