Skip to content

Commit 1b72238

Browse files
committed
Merge branch 'UTF-16LE_b'
2 parents 55ca988 + 6c605b0 commit 1b72238

File tree

16 files changed

+437
-297
lines changed

16 files changed

+437
-297
lines changed

Generate.hs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,12 @@ main = do
4848
,"import qualified System.OsPath.Windows as AFP_W"
4949
,"import qualified System.OsPath.Posix as AFP_P"
5050
#endif
51-
,"instance IsString WindowsString where fromString = WS . either (error . show) id . encodeWith (mkUTF16le TransliterateCodingFailure)"
52-
,"instance IsString PosixString where fromString = PS . either (error . show) id . encodeWith (mkUTF8 TransliterateCodingFailure)"
51+
,"instance IsString WindowsString where fromString = WS . either (error . show) id . encodeWithTE (mkUTF16le TransliterateCodingFailure)"
52+
,"instance IsString PosixString where fromString = PS . either (error . show) id . encodeWithTE (mkUTF8 TransliterateCodingFailure)"
5353
,"#if defined(mingw32_HOST_OS) || defined(__MINGW32__)"
54-
,"instance IsString OsString where fromString = OsString . WS . either (error . show) id . encodeWith (mkUTF16le TransliterateCodingFailure)"
54+
,"instance IsString OsString where fromString = OsString . WS . either (error . show) id . encodeWithTE (mkUTF16le TransliterateCodingFailure)"
5555
,"#else"
56-
,"instance IsString OsString where fromString = OsString . PS . either (error . show) id . encodeWith (mkUTF8 TransliterateCodingFailure)"
56+
,"instance IsString OsString where fromString = OsString . PS . either (error . show) id . encodeWithTE (mkUTF8 TransliterateCodingFailure)"
5757
,"#endif"
5858
,"tests :: [(String, Property)]"
5959
,"tests ="] ++

System/FilePath/Internal.hs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ import Data.List(stripPrefix, isSuffixOf, uncons)
127127
#define FILEPATH FilePath
128128
#else
129129
import Prelude (fromIntegral)
130-
import System.OsPath.Encoding ( encodeWith )
130+
import System.OsPath.Encoding ( encodeWithTE )
131131
import GHC.IO.Encoding.Failure ( CodingFailureMode(..) )
132132
import qualified Data.Char as C
133133
#ifdef WINDOWS
@@ -1180,10 +1180,10 @@ snoc str = \c -> str <> [c]
11801180
#else
11811181
#ifdef WINDOWS
11821182
fromString :: P.String -> STRING
1183-
fromString = P.either (P.error . P.show) P.id . encodeWith (mkUTF16le ErrorOnCodingFailure)
1183+
fromString = P.either (P.error . P.show) P.id . encodeWithTE (mkUTF16le ErrorOnCodingFailure)
11841184
#else
11851185
fromString :: P.String -> STRING
1186-
fromString = P.either (P.error . P.show) P.id . encodeWith (mkUTF8 ErrorOnCodingFailure)
1186+
fromString = P.either (P.error . P.show) P.id . encodeWithTE (mkUTF8 ErrorOnCodingFailure)
11871187
#endif
11881188

11891189
_a, _z, _A, _Z, _period, _quotedbl, _backslash, _slash, _question, _U, _N, _C, _colon, _semicolon, _US, _less, _greater, _bar, _asterisk, _nul, _space, _underscore :: CHAR

System/OsPath/Common.hs

Lines changed: 35 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -40,32 +40,21 @@ module System.OsPath
4040
, OsChar
4141
#endif
4242
-- * Filepath construction
43+
, PS.encodeUtf
44+
, PS.encodeWith
45+
, PS.encodeFS
4346
#if defined(WINDOWS) || defined(POSIX)
44-
, toPlatformStringUtf
45-
, toPlatformStringEnc
46-
, toPlatformStringFS
4747
, pstr
48-
, packPlatformString
4948
#else
50-
, toOsPathUtf
51-
, toOsPathEnc
52-
, toOsPathFS
5349
, osp
54-
, packOsPath
5550
#endif
51+
, PS.pack
5652

5753
-- * Filepath deconstruction
58-
#if defined(WINDOWS) || defined(POSIX)
59-
, fromPlatformStringUtf
60-
, fromPlatformStringEnc
61-
, fromPlatformStringFS
62-
, unpackPlatformString
63-
#else
64-
, fromOsPathUtf
65-
, fromOsPathEnc
66-
, fromOsPathFS
67-
, unpackOsPath
68-
#endif
54+
, PS.decodeUtf
55+
, PS.decodeWith
56+
, PS.decodeFS
57+
, PS.unpack
6958

7059
-- * Word construction
7160
, unsafeFromChar
@@ -119,53 +108,53 @@ where
119108

120109
#ifdef WINDOWS
121110
import System.OsPath.Types
122-
import System.OsString.Windows
111+
import System.OsString.Windows as PS
123112
( unsafeFromChar
124113
, toChar
125-
, fromPlatformStringUtf
126-
, fromPlatformStringEnc
127-
, fromPlatformStringFS
128-
, packPlatformString
114+
, decodeUtf
115+
, decodeWith
116+
, decodeFS
117+
, pack
129118
, pstr
130-
, toPlatformStringUtf
131-
, toPlatformStringEnc
132-
, toPlatformStringFS
133-
, unpackPlatformString
119+
, encodeUtf
120+
, encodeWith
121+
, encodeFS
122+
, unpack
134123
)
135124
import Data.Bifunctor ( bimap )
136125
import qualified System.OsPath.Windows.Internal as C
137126

138127
#elif defined(POSIX)
139128

140129
import System.OsPath.Types
141-
import System.OsString.Posix
130+
import System.OsString.Posix as PS
142131
( unsafeFromChar
143132
, toChar
144-
, fromPlatformStringUtf
145-
, fromPlatformStringEnc
146-
, fromPlatformStringFS
147-
, packPlatformString
133+
, decodeUtf
134+
, decodeWith
135+
, decodeFS
136+
, pack
148137
, pstr
149-
, toPlatformStringUtf
150-
, toPlatformStringEnc
151-
, toPlatformStringFS
152-
, unpackPlatformString
138+
, encodeUtf
139+
, encodeWith
140+
, encodeFS
141+
, unpack
153142
)
154143
import Data.Bifunctor ( bimap )
155144
import qualified System.OsPath.Posix.Internal as C
156145

157146
#else
158147

159-
import System.OsPath.Internal
148+
import System.OsPath.Internal as PS
160149
( osp
161-
, fromOsPathUtf
162-
, fromOsPathEnc
163-
, fromOsPathFS
164-
, packOsPath
165-
, toOsPathUtf
166-
, toOsPathEnc
167-
, toOsPathFS
168-
, unpackOsPath
150+
, decodeUtf
151+
, decodeWith
152+
, decodeFS
153+
, pack
154+
, encodeUtf
155+
, encodeWith
156+
, encodeFS
157+
, unpack
169158
)
170159
import System.OsPath.Types
171160
( OsPath )

System/OsPath/Encoding.hs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,17 @@ module System.OsPath.Encoding
1212
, ucs2le_decode
1313
, ucs2le_encode
1414

15+
-- * UTF-16LE_b
16+
, utf16le_b
17+
, mkUTF16le_b
18+
, utf16le_b_DF
19+
, utf16le_b_EF
20+
, utf16le_b_decode
21+
, utf16le_b_encode
22+
1523
-- * base encoding
16-
, encodeWith
17-
, decodeWith
24+
, encodeWithTE
25+
, decodeWithTE
1826
, encodeWithBasePosix
1927
, decodeWithBasePosix
2028
, encodeWithBaseWindows

System/OsPath/Encoding/Internal.hs

Lines changed: 114 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
{-# LANGUAGE NoImplicitPrelude
22
, BangPatterns
33
, TypeApplications
4+
, MultiWayIf
45
#-}
56
{-# OPTIONS_GHC -funbox-strict-fields #-}
67

@@ -114,11 +115,120 @@ ucs2le_encode
114115
in
115116
loop ir0 ow0
116117

118+
-- -----------------------------------------------------------------------------
119+
-- UTF-16b
120+
--
121+
122+
-- | Mimics the base encoding for filesystem operations. This should be total on all inputs (word16 byte arrays).
123+
--
124+
-- Note that this has a subtle difference to 'encodeWithBaseWindows'/'decodeWithBaseWindows': it doesn't care for
125+
-- the @0x0000@ end marker and will as such produce different results. Use @takeWhile (/= '\NUL')@ on the input
126+
-- to recover this behavior.
127+
utf16le_b :: TextEncoding
128+
utf16le_b = mkUTF16le_b ErrorOnCodingFailure
129+
130+
mkUTF16le_b :: CodingFailureMode -> TextEncoding
131+
mkUTF16le_b cfm = TextEncoding { textEncodingName = "UTF-16LE_b",
132+
mkTextDecoder = utf16le_b_DF cfm,
133+
mkTextEncoder = utf16le_b_EF cfm }
134+
135+
utf16le_b_DF :: CodingFailureMode -> IO (TextDecoder ())
136+
utf16le_b_DF cfm =
137+
return (BufferCodec {
138+
encode = utf16le_b_decode,
139+
recover = recoverDecode cfm,
140+
close = return (),
141+
getState = return (),
142+
setState = const $ return ()
143+
})
144+
145+
utf16le_b_EF :: CodingFailureMode -> IO (TextEncoder ())
146+
utf16le_b_EF cfm =
147+
return (BufferCodec {
148+
encode = utf16le_b_encode,
149+
recover = recoverEncode cfm,
150+
close = return (),
151+
getState = return (),
152+
setState = const $ return ()
153+
})
154+
155+
156+
utf16le_b_decode :: DecodeBuffer
157+
utf16le_b_decode
158+
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
159+
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
160+
= let
161+
loop !ir !ow
162+
| ow >= os = done OutputUnderflow ir ow
163+
| ir >= iw = done InputUnderflow ir ow
164+
| ir + 1 == iw = done InputUnderflow ir ow
165+
| otherwise = do
166+
c0 <- readWord8Buf iraw ir
167+
c1 <- readWord8Buf iraw (ir+1)
168+
let x1 = fromIntegral c1 `shiftL` 8 + fromIntegral c0
169+
if | iw - ir >= 4 -> do
170+
c2 <- readWord8Buf iraw (ir+2)
171+
c3 <- readWord8Buf iraw (ir+3)
172+
let x2 = fromIntegral c3 `shiftL` 8 + fromIntegral c2
173+
if | 0xd800 <= x1 && x1 <= 0xdbff
174+
, 0xdc00 <= x2 && x2 <= 0xdfff -> do
175+
ow' <- writeCharBuf oraw ow (unsafeChr ((x1 - 0xd800)*0x400 + (x2 - 0xdc00) + 0x10000))
176+
loop (ir+4) ow'
177+
| otherwise -> do
178+
ow' <- writeCharBuf oraw ow (unsafeChr x1)
179+
loop (ir+2) ow'
180+
| iw - ir >= 2 -> do
181+
ow' <- writeCharBuf oraw ow (unsafeChr x1)
182+
loop (ir+2) ow'
183+
| otherwise -> done InputUnderflow ir ow
184+
185+
-- lambda-lifted, to avoid thunks being built in the inner-loop:
186+
done why !ir !ow = return (why,
187+
if ir == iw then input{ bufL=0, bufR=0 }
188+
else input{ bufL=ir },
189+
output{ bufR=ow })
190+
in
191+
loop ir0 ow0
192+
193+
194+
utf16le_b_encode :: EncodeBuffer
195+
utf16le_b_encode
196+
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
197+
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
198+
= let
199+
done why !ir !ow = return (why,
200+
if ir == iw then input{ bufL=0, bufR=0 }
201+
else input{ bufL=ir },
202+
output{ bufR=ow })
203+
loop !ir !ow
204+
| ir >= iw = done InputUnderflow ir ow
205+
| os - ow < 2 = done OutputUnderflow ir ow
206+
| otherwise = do
207+
(c,ir') <- readCharBuf iraw ir
208+
case ord c of
209+
x | x < 0x10000 -> do
210+
writeWord8Buf oraw ow (fromIntegral x)
211+
writeWord8Buf oraw (ow+1) (fromIntegral (x `shiftR` 8))
212+
loop ir' (ow+2)
213+
| otherwise ->
214+
if os - ow < 4 then done OutputUnderflow ir ow else do
215+
let x' = x - 0x10000
216+
w1 = x' `div` 0x400 + 0xd800
217+
w2 = x' `mod` 0x400 + 0xdc00
218+
writeWord8Buf oraw ow (fromIntegral w1)
219+
writeWord8Buf oraw (ow+1) (fromIntegral (w1 `shiftR` 8))
220+
writeWord8Buf oraw (ow+2) (fromIntegral w2)
221+
writeWord8Buf oraw (ow+3) (fromIntegral (w2 `shiftR` 8))
222+
loop ir' (ow+4)
223+
in
224+
loop ir0 ow0
117225

118226
-- -----------------------------------------------------------------------------
119227
-- Windows encoding (ripped off from base)
120228
--
121229

230+
cWcharsToChars_UCS2 :: [Word16] -> [Char]
231+
cWcharsToChars_UCS2 = map (chr . fromIntegral)
122232

123233

124234
-- On Windows, wchar_t is 16 bits wide and CWString uses the UTF-16 encoding.
@@ -185,14 +295,14 @@ peekFilePathLenPosix fp = getFileSystemEncoding >>= \enc -> GHC.peekCStringLen e
185295
--
186296

187297
-- | Decode with the given 'TextEncoding'.
188-
decodeWith :: TextEncoding -> BS8.ShortByteString -> Either EncodingException String
189-
decodeWith enc ba = unsafePerformIO $ do
298+
decodeWithTE :: TextEncoding -> BS8.ShortByteString -> Either EncodingException String
299+
decodeWithTE enc ba = unsafePerformIO $ do
190300
r <- try @SomeException $ BS8.useAsCStringLen ba $ \fp -> GHC.peekCStringLen enc fp
191301
evaluate $ force $ first (flip EncodingError Nothing . displayException) r
192302

193303
-- | Encode with the given 'TextEncoding'.
194-
encodeWith :: TextEncoding -> String -> Either EncodingException BS8.ShortByteString
195-
encodeWith enc str = unsafePerformIO $ do
304+
encodeWithTE :: TextEncoding -> String -> Either EncodingException BS8.ShortByteString
305+
encodeWithTE enc str = unsafePerformIO $ do
196306
r <- try @SomeException $ GHC.withCStringLen enc str $ \cstr -> BS8.packCStringLen cstr
197307
evaluate $ force $ first (flip EncodingError Nothing . displayException) r
198308

0 commit comments

Comments
 (0)