Skip to content

Commit 591ab09

Browse files
committed
Fix some encoding issues and other stuff
1 parent 1b72238 commit 591ab09

File tree

15 files changed

+256
-165
lines changed

15 files changed

+256
-165
lines changed

System/FilePath/Internal.hs

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
{-# LANGUAGE PatternGuards #-}
2+
{-# LANGUAGE TypeApplications #-}
23

34
-- This template expects CPP definitions for:
45
-- MODULE_NAME = Posix | Windows
@@ -127,17 +128,25 @@ import Data.List(stripPrefix, isSuffixOf, uncons)
127128
#define FILEPATH FilePath
128129
#else
129130
import Prelude (fromIntegral)
130-
import System.OsPath.Encoding ( encodeWithTE )
131-
import GHC.IO.Encoding.Failure ( CodingFailureMode(..) )
131+
import Control.Exception ( SomeException, evaluate, try, displayException )
132+
import Data.Bifunctor (first)
133+
import Control.DeepSeq (force)
134+
import GHC.IO (unsafePerformIO)
132135
import qualified Data.Char as C
133136
#ifdef WINDOWS
137+
import GHC.IO.Encoding.Failure ( CodingFailureMode(..) )
138+
import qualified GHC.Foreign as GHC
134139
import GHC.IO.Encoding.UTF16 ( mkUTF16le )
140+
import qualified GHC.Foreign as GHC
135141
import Data.Word ( Word16 )
136142
import System.OsPath.Data.ByteString.Short.Word16
143+
import System.OsPath.Data.ByteString.Short ( packCStringLen )
137144
#define CHAR Word16
138145
#define STRING ShortByteString
139146
#define FILEPATH ShortByteString
140147
#else
148+
import GHC.IO.Encoding.Failure ( CodingFailureMode(..) )
149+
import qualified GHC.Foreign as GHC
141150
import GHC.IO.Encoding.UTF8 ( mkUTF8 )
142151
import Data.Word ( Word8 )
143152
import System.OsPath.Data.ByteString.Short
@@ -1180,10 +1189,14 @@ snoc str = \c -> str <> [c]
11801189
#else
11811190
#ifdef WINDOWS
11821191
fromString :: P.String -> STRING
1183-
fromString = P.either (P.error . P.show) P.id . encodeWithTE (mkUTF16le ErrorOnCodingFailure)
1192+
fromString str = P.either (P.error . P.show) P.id $ unsafePerformIO $ do
1193+
r <- try @SomeException $ GHC.withCStringLen (mkUTF16le ErrorOnCodingFailure) str $ \cstr -> packCStringLen cstr
1194+
evaluate $ force $ first displayException r
11841195
#else
11851196
fromString :: P.String -> STRING
1186-
fromString = P.either (P.error . P.show) P.id . encodeWithTE (mkUTF8 ErrorOnCodingFailure)
1197+
fromString str = P.either (P.error . P.show) P.id $ unsafePerformIO $ do
1198+
r <- try @SomeException $ GHC.withCStringLen (mkUTF8 ErrorOnCodingFailure) str $ \cstr -> packCStringLen cstr
1199+
evaluate $ force $ first displayException r
11871200
#endif
11881201

11891202
_a, _z, _A, _Z, _period, _quotedbl, _backslash, _slash, _question, _U, _N, _C, _colon, _semicolon, _US, _less, _greater, _bar, _asterisk, _nul, _space, _underscore :: CHAR

System/OsPath.hs

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,7 @@
3030
-- are often interpreted as UTF8) as per the
3131
-- <https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_170 POSIX specification>
3232
-- and are passed as @char[]@ to syscalls. 'OsPath' maintains no invariant
33-
-- here. Some functions however, such as 'toOsPathUtf', may expect
34-
-- or produce UTF8.
33+
-- here.
3534
--
3635
-- Apart from encoding, filepaths have additional restrictions per platform:
3736
--
@@ -50,14 +49,13 @@
5049
-- It is advised to follow these principles when dealing with filepaths\/filenames:
5150
--
5251
-- 1. Avoid interpreting filenames that the OS returns, unless absolutely necessary.
53-
-- For example, the filepath separator is usually a predefined 'Word8', regardless of encoding.
52+
-- For example, the filepath separator is usually a predefined 'Word8'/'Word16', regardless of encoding.
5453
-- So even if we need to split filepaths, it might still not be necessary to understand the encoding
5554
-- of the filename.
5655
-- 2. When interpreting OS returned filenames consider that these might not be UTF8 on /unix/
57-
-- or at worst don't have an ASCII compatible encoding. Some strategies here involve looking
58-
-- up the current locale and using that for decoding ('fromOsPathFS' does this).
59-
-- Otherwise it can be reasonable to assume UTF8 on unix ('fromOsPathUtf' does that) if your application specifically
60-
-- mentions that it requires a UTF8 compatible system. If you know the encoding, you can just use 'fromOsPathEnc'.
56+
-- or at worst don't have an ASCII compatible encoding. The are 3 available strategies fer decoding/encoding:
57+
-- a) pick the best UTF (UTF-8 on unix, UTF-16LE on windows), b) decode with an explicitly defined 'TextEncoding',
58+
-- c) mimic the behavior of the @base@ library (permissive UTF16 on windows, current filesystem encoding on unix).
6159
-- 3. Avoid comparing @String@ based filepaths, because filenames of different encodings
6260
-- may have the same @String@ representation, although they're not the same byte-wise.
6361

System/OsPath/Common.hs

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
{-# LANGUAGE TypeApplications #-}
12
-- This template expects CPP definitions for:
23
--
34
-- WINDOWS defined? = no | yes | no
@@ -115,17 +116,30 @@ import System.OsString.Windows as PS
115116
, decodeWith
116117
, decodeFS
117118
, pack
118-
, pstr
119119
, encodeUtf
120120
, encodeWith
121121
, encodeFS
122122
, unpack
123123
)
124124
import Data.Bifunctor ( bimap )
125125
import qualified System.OsPath.Windows.Internal as C
126+
import GHC.IO.Encoding.UTF16 ( mkUTF16le )
127+
import Language.Haskell.TH.Quote
128+
( QuasiQuoter (..) )
129+
import Language.Haskell.TH.Syntax
130+
( Lift (..), lift )
131+
import GHC.IO.Encoding.Failure ( CodingFailureMode(..) )
132+
import Control.Monad ( when )
126133

127134
#elif defined(POSIX)
128-
135+
import GHC.IO.Encoding.Failure ( CodingFailureMode(..) )
136+
import Control.Monad ( when )
137+
import Language.Haskell.TH.Quote
138+
( QuasiQuoter (..) )
139+
import Language.Haskell.TH.Syntax
140+
( Lift (..), lift )
141+
142+
import GHC.IO.Encoding.UTF8 ( mkUTF8 )
129143
import System.OsPath.Types
130144
import System.OsString.Posix as PS
131145
( unsafeFromChar
@@ -134,7 +148,6 @@ import System.OsString.Posix as PS
134148
, decodeWith
135149
, decodeFS
136150
, pack
137-
, pstr
138151
, encodeUtf
139152
, encodeWith
140153
, encodeFS
@@ -158,7 +171,7 @@ import System.OsPath.Internal as PS
158171
)
159172
import System.OsPath.Types
160173
( OsPath )
161-
import System.OsString
174+
import System.OsString ( unsafeFromChar, toChar )
162175

163176
#if defined(mingw32_HOST_OS) || defined(__MINGW32__)
164177
import qualified System.OsPath.Windows as C

System/OsPath/Encoding.hs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,6 @@ module System.OsPath.Encoding
2121
, utf16le_b_encode
2222

2323
-- * base encoding
24-
, encodeWithTE
25-
, decodeWithTE
2624
, encodeWithBasePosix
2725
, decodeWithBasePosix
2826
, encodeWithBaseWindows

System/OsPath/Encoding/Internal.hs

Lines changed: 34 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ import Data.Bifunctor (first)
2828
import Data.Data (Typeable)
2929
import GHC.Show (Show (show))
3030
import Numeric (showHex)
31-
import Foreign.C (CString, CStringLen)
31+
import Foreign.C (CStringLen)
3232
import Data.Char (chr)
3333
import Foreign
3434
import Prelude (FilePath)
@@ -260,39 +260,19 @@ charsToCWchars = foldr (utf16Char . ord) []
260260
-- FFI
261261
--
262262

263-
-- | Marshal a Haskell string into a NUL terminated C wide string using
264-
-- temporary storage.
265-
--
266-
-- * the Haskell string may /not/ contain any NUL characters
267-
--
268-
-- * the memory is freed when the subcomputation terminates (either
269-
-- normally or via an exception), so the pointer to the temporary
270-
-- storage must /not/ be used after this.
271-
--
272-
withCWString :: String -> (Ptr Word16 -> IO a) -> IO a
273-
withCWString = withArray0 wNUL . charsToCWchars
263+
withFilePathWin :: FilePath -> (Int -> Ptr Word16 -> IO a) -> IO a
264+
withFilePathWin = withArrayLen . charsToCWchars
274265

275-
peekCWString :: Ptr Word16 -> IO String
276-
peekCWString cp = do
277-
cs <- peekArray0 wNUL cp
266+
peekFilePathWin :: (Ptr Word16, Int) -> IO FilePath
267+
peekFilePathWin (cp, l) = do
268+
cs <- peekArray l cp
278269
return (cWcharsToChars cs)
279270

280-
withFilePathWin :: FilePath -> (Ptr Word16 -> IO a) -> IO a
281-
withFilePathWin = withCWString
282-
283-
peekFilePathWin :: Ptr Word16 -> IO FilePath
284-
peekFilePathWin = peekCWString
271+
withFilePathPosix :: FilePath -> (CStringLen -> IO a) -> IO a
272+
withFilePathPosix fp f = getFileSystemEncoding >>= \enc -> GHC.withCStringLen enc fp f
285273

286-
withFilePathPosix :: FilePath -> (CString -> IO a) -> IO a
287-
withFilePathPosix fp f = getFileSystemEncoding >>= \enc -> GHC.withCString enc fp f
288-
289-
peekFilePathLenPosix :: CStringLen -> IO FilePath
290-
peekFilePathLenPosix fp = getFileSystemEncoding >>= \enc -> GHC.peekCStringLen enc fp
291-
292-
293-
-- -----------------------------------------------------------------------------
294-
-- Encoders / decoders
295-
--
274+
peekFilePathPosix :: CStringLen -> IO FilePath
275+
peekFilePathPosix fp = getFileSystemEncoding >>= \enc -> GHC.peekCStringLen enc fp
296276

297277
-- | Decode with the given 'TextEncoding'.
298278
decodeWithTE :: TextEncoding -> BS8.ShortByteString -> Either EncodingException String
@@ -306,21 +286,33 @@ encodeWithTE enc str = unsafePerformIO $ do
306286
r <- try @SomeException $ GHC.withCStringLen enc str $ \cstr -> BS8.packCStringLen cstr
307287
evaluate $ force $ first (flip EncodingError Nothing . displayException) r
308288

309-
-- | This mimics the filepath ddecoder base uses on unix.
289+
-- -----------------------------------------------------------------------------
290+
-- Encoders / decoders
291+
--
292+
293+
-- | This mimics the filepath decoder base uses on unix,
294+
-- with the small distinction that we're not truncating at NUL bytes (because we're not at
295+
-- the outer FFI layer).
310296
decodeWithBasePosix :: BS8.ShortByteString -> IO String
311-
decodeWithBasePosix ba = BS8.useAsCStringLen ba $ \fp -> peekFilePathLenPosix fp
297+
decodeWithBasePosix ba = BS8.useAsCStringLen ba $ \fp -> peekFilePathPosix fp
312298

313-
-- | This mimics the filepath dencoder base uses on unix.
299+
-- | This mimics the filepath dencoder base uses on unix,
300+
-- with the small distinction that we're not truncating at NUL bytes (because we're not at
301+
-- the outer FFI layer).
314302
encodeWithBasePosix :: String -> IO BS8.ShortByteString
315-
encodeWithBasePosix str = withFilePathPosix str $ \cstr -> BS8.packCString cstr
316-
317-
-- | This mimics the filepath decoder base uses on windows.
318-
decodeWithBaseWindows :: BS16.ShortByteString -> String
319-
decodeWithBaseWindows = cWcharsToChars . BS16.unpack
320-
321-
-- | This mimics the filepath dencoder base uses on windows.
322-
encodeWithBaseWindows :: String -> BS8.ShortByteString
323-
encodeWithBaseWindows = BS16.pack . charsToCWchars
303+
encodeWithBasePosix str = withFilePathPosix str $ \cstr -> BS8.packCStringLen cstr
304+
305+
-- | This mimics the filepath decoder base uses on windows,
306+
-- with the small distinction that we're not truncating at NUL bytes (because we're not at
307+
-- the outer FFI layer).
308+
decodeWithBaseWindows :: BS16.ShortByteString -> IO String
309+
decodeWithBaseWindows ba = BS16.useAsCWStringLen ba $ \fp -> peekFilePathWin fp
310+
311+
-- | This mimics the filepath dencoder base uses on windows,
312+
-- with the small distinction that we're not truncating at NUL bytes (because we're not at
313+
-- the outer FFI layer).
314+
encodeWithBaseWindows :: String -> IO BS16.ShortByteString
315+
encodeWithBaseWindows str = withFilePathWin str $ \l cstr -> BS16.packCWStringLen (cstr, l)
324316

325317

326318
-- -----------------------------------------------------------------------------

System/OsPath/Internal.hs

Lines changed: 43 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,21 +7,34 @@ module System.OsPath.Internal where
77
import {-# SOURCE #-} System.OsPath
88
( isValid )
99
import System.OsPath.Types
10-
import System.OsString.Internal hiding ( fromBytes )
1110
import qualified System.OsString.Internal as OS
1211

1312
import Control.Monad.Catch
1413
( MonadThrow )
1514
import Data.ByteString
1615
( ByteString )
17-
import Language.Haskell.TH
1816
import Language.Haskell.TH.Quote
1917
( QuasiQuoter (..) )
2018
import Language.Haskell.TH.Syntax
2119
( Lift (..), lift )
20+
import GHC.IO.Encoding.Failure ( CodingFailureMode(..) )
21+
22+
import System.OsString.Internal.Types
23+
#ifdef WINDOWS
24+
import qualified System.OsPath.Windows as PF
25+
import System.IO
26+
( TextEncoding, utf16le )
27+
import GHC.IO.Encoding.UTF16 ( mkUTF16le )
28+
import qualified System.OsPath.Data.ByteString.Short.Word16 as BS16
29+
import qualified System.OsPath.Data.ByteString.Short as BS8
30+
#else
31+
import qualified System.OsPath.Posix as PF
32+
import System.OsPath.Encoding
2233
import System.IO
2334
( TextEncoding )
24-
import System.OsPath.Encoding ( EncodingException(..) )
35+
import GHC.IO.Encoding.UTF8 ( mkUTF8 )
36+
import Control.Monad (when)
37+
#endif
2538

2639

2740

@@ -98,20 +111,37 @@ fromBytes :: MonadThrow m
98111
fromBytes = OS.fromBytes
99112

100113

101-
mkOsPath :: ByteString -> Q Exp
102-
mkOsPath bs =
103-
case fromBytes bs of
104-
Just afp' ->
105-
if isValid afp'
106-
then lift afp'
107-
else error "invalid filepath"
108-
Nothing -> error "invalid encoding"
109114

110115
-- | QuasiQuote an 'OsPath'. This accepts Unicode characters
111-
-- and encodes as UTF-8 on unix and UTF-16 on windows. Runs 'filepathIsValid'
116+
-- and encodes as UTF-8 on unix and UTF-16LE on windows. Runs 'isValid'
112117
-- on the input.
113118
osp :: QuasiQuoter
114-
osp = qq mkOsPath
119+
osp = QuasiQuoter
120+
#if defined(mingw32_HOST_OS) || defined(__MINGW32__)
121+
{ quoteExp = \s -> do
122+
osp' <- either (fail . show) (pure . OsString) . PF.encodeWith (mkUTF16le ErrorOnCodingFailure) $ s
123+
when (not $ isValid osp') $ fail ("filepath now valid: " <> show osp')
124+
lift osp'
125+
, quotePat = \_ ->
126+
fail "illegal QuasiQuote (allowed as expression only, used as a pattern)"
127+
, quoteType = \_ ->
128+
fail "illegal QuasiQuote (allowed as expression only, used as a type)"
129+
, quoteDec = \_ ->
130+
fail "illegal QuasiQuote (allowed as expression only, used as a declaration)"
131+
}
132+
#else
133+
{ quoteExp = \s -> do
134+
osp' <- either (fail . show) (pure . OsString) . PF.encodeWith (mkUTF8 ErrorOnCodingFailure) $ s
135+
when (not $ isValid osp') $ fail ("filepath now valid: " <> show osp')
136+
lift osp'
137+
, quotePat = \_ ->
138+
fail "illegal QuasiQuote (allowed as expression only, used as a pattern)"
139+
, quoteType = \_ ->
140+
fail "illegal QuasiQuote (allowed as expression only, used as a type)"
141+
, quoteDec = \_ ->
142+
fail "illegal QuasiQuote (allowed as expression only, used as a declaration)"
143+
}
144+
#endif
115145

116146

117147
-- | Unpack an 'OsPath' to a list of 'OsChar'.

System/OsPath/Posix.hs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,20 @@
88
#define WORD_NAME PosixChar
99

1010
#include "Common.hs"
11+
12+
-- | QuasiQuote a 'PosixPath'. This accepts Unicode characters
13+
-- and encodes as UTF-8. Runs 'isValid' on the input.
14+
pstr :: QuasiQuoter
15+
pstr =
16+
QuasiQuoter
17+
{ quoteExp = \s -> do
18+
ps <- either (fail . show) pure $ encodeWith (mkUTF8 ErrorOnCodingFailure) $ s
19+
when (not $ isValid ps) $ fail ("filepath now valid: " <> show ps)
20+
lift ps
21+
, quotePat = \_ ->
22+
fail "illegal QuasiQuote (allowed as expression only, used as a pattern)"
23+
, quoteType = \_ ->
24+
fail "illegal QuasiQuote (allowed as expression only, used as a type)"
25+
, quoteDec = \_ ->
26+
fail "illegal QuasiQuote (allowed as expression only, used as a declaration)"
27+
}

System/OsPath/Windows.hs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,21 @@
88
#define WORD_NAME WindowsChar
99

1010
#include "Common.hs"
11+
12+
13+
-- | QuasiQuote a 'WindowsPath'. This accepts Unicode characters
14+
-- and encodes as UTF-16LE. Runs 'isValid' on the input.
15+
pstr :: QuasiQuoter
16+
pstr =
17+
QuasiQuoter
18+
{ quoteExp = \s -> do
19+
ps <- either (fail . show) pure $ encodeWith (mkUTF16le ErrorOnCodingFailure) $ s
20+
when (not $ isValid ps) $ fail ("filepath now valid: " <> show ps)
21+
lift ps
22+
, quotePat = \_ ->
23+
fail "illegal QuasiQuote (allowed as expression only, used as a pattern)"
24+
, quoteType = \_ ->
25+
fail "illegal QuasiQuote (allowed as expression only, used as a type)"
26+
, quoteDec = \_ ->
27+
fail "illegal QuasiQuote (allowed as expression only, used as a declaration)"
28+
}

0 commit comments

Comments
 (0)