Skip to content

Commit 0a6f617

Browse files
Hook path validations in fromChunk and fromChars
1 parent 0125c61 commit 0a6f617

File tree

3 files changed

+176
-113
lines changed

3 files changed

+176
-113
lines changed

core/src/Streamly/Internal/FileSystem/Path/Common.hs

Lines changed: 153 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ module Streamly.Internal.FileSystem.Path.Common
1212

1313
-- * Construction
1414
, isValid
15+
, validatePath
16+
, validateFile
1517
, fromChunk
1618
, unsafeFromChunk
1719
, fromChars
@@ -32,7 +34,6 @@ module Streamly.Internal.FileSystem.Path.Common
3234
, hasTrailingSeparator
3335
, isBranch
3436
, isRooted
35-
, maybeFile
3637
, isAbsolute
3738
, isRootRelative
3839
, isRelativeWithDrive
@@ -84,7 +85,7 @@ where
8485
import Control.Monad (when)
8586
import Control.Monad.Catch (MonadThrow(..))
8687
import Control.Monad.IO.Class (MonadIO(..))
87-
import Data.Char (ord, isAlpha, toUpper)
88+
import Data.Char (chr, ord, isAlpha, toUpper)
8889
import Data.Function ((&))
8990
import Data.Functor.Identity (Identity(..))
9091
#ifdef DEBUG
@@ -189,8 +190,8 @@ primarySeparator Windows = windowsSeparator
189190
-- @/@ or @\\@.
190191
{-# INLINE isSeparator #-}
191192
isSeparator :: OS -> Char -> Bool
192-
isSeparator Windows c = (c == windowsSeparator) || (c == posixSeparator)
193193
isSeparator Posix c = c == posixSeparator
194+
isSeparator Windows c = (c == windowsSeparator) || (c == posixSeparator)
194195

195196
{-# INLINE isSeparatorWord #-}
196197
isSeparatorWord :: Integral a => OS -> a -> Bool
@@ -625,6 +626,9 @@ unsafeSplitUNC arr =
625626
--
626627
{-# INLINE splitRoot #-}
627628
splitRoot :: (Unbox a, Integral a) => OS -> Array a -> (Array a, Array a)
629+
-- NOTE: validatePath depends on splitRoot splitting the path without removing
630+
-- any redundant chars etc. It should just split and do nothing else.
631+
-- XXX We can put an assert here "arrLen == rootLen + stemLen".
628632
splitRoot Posix arr
629633
| isRooted Posix arr
630634
= unsafeSplitTopLevel Posix arr
@@ -898,8 +902,8 @@ splitTail _os _arr = undefined
898902

899903
-- | Returns () if the path can be a valid file, otherwise throws an
900904
-- exception.
901-
maybeFile :: (MonadThrow m, Unbox a, Integral a) => OS -> Array a -> m ()
902-
maybeFile os arr = do
905+
validateFile :: (MonadThrow m, Unbox a, Integral a) => OS -> Array a -> m ()
906+
validateFile os arr = do
903907
s1 <-
904908
Stream.toList
905909
$ Stream.take 3
@@ -1332,7 +1336,7 @@ splitAllExtensions = splitAllExtensionsBy False extensionWord
13321336
-- Construction
13331337
------------------------------------------------------------------------------
13341338

1335-
-- | Only for Windows.
1339+
{-# INLINE isInvalidPathChar #-}
13361340
isInvalidPathChar :: Integral a => OS -> a -> Bool
13371341
isInvalidPathChar Posix x = x == 0
13381342
isInvalidPathChar Windows x =
@@ -1347,6 +1351,11 @@ isInvalidPathChar Windows x =
13471351
124 -> True -- '|'
13481352
_ -> x <= charToWord '\US'
13491353

1354+
countLeadingValid :: (Unbox a, Integral a) => OS -> Array a -> Int
1355+
countLeadingValid os path =
1356+
let f = Fold.takeEndBy_ (isInvalidPathChar os) Fold.length
1357+
in foldArr f path
1358+
13501359
-- XXX Supply it an array for checking and use a more efficient prefix matching
13511360
-- check.
13521361

@@ -1358,12 +1367,111 @@ isInvalidPathComponent = fmap (fmap charToWord)
13581367
, "LPT1","LPT2","LPT3","LPT4","LPT5","LPT6","LPT7","LPT8","LPT9"
13591368
]
13601369

1361-
-- Note: "//share/x" works in powershell.
1370+
validatePath :: (MonadThrow m, Integral a, Unbox a) => OS -> Array a -> m ()
1371+
validatePath Posix path =
1372+
let pathLen = Array.length path
1373+
validLen = countLeadingValid Posix path
1374+
in if pathLen == 0
1375+
then throwM $ InvalidPath "Empty path"
1376+
else if pathLen /= validLen
1377+
then throwM $ InvalidPath
1378+
$ "Null char found after " ++ show validLen ++ " characters."
1379+
else pure ()
1380+
validatePath Windows path
1381+
| Array.null path = throwM $ InvalidPath "Empty path"
1382+
| otherwise = do
1383+
-- XXX give position of the first invalid char.
1384+
if hasDrive path && postDriveSep > 1 -- "C://"
1385+
then throwM $ InvalidPath
1386+
$ "More than one separators between drive root and the path"
1387+
else if isAbsoluteUNC path
1388+
then
1389+
if postDriveSep > 1 -- "///x"
1390+
then throwM $ InvalidPath
1391+
$ "Path starts with more than two separators"
1392+
-- XXX covered by the previous check
1393+
-- else if Array.length path == postDriveSep + 2
1394+
-- then throwM $ InvalidPath $ "Only separators in share root"
1395+
else if invalidRootComponent -- "//prn/x"
1396+
then throwM $ InvalidPath
1397+
-- XXX print the invalid component name
1398+
$ "Special filename component in share root"
1399+
else if rootEndSeps /= 1 -- "//share//x"
1400+
then throwM $ InvalidPath
1401+
$ "Share name is needed and exactly one separator is needed "
1402+
++ "between share root and the path"
1403+
else if Array.null stem -- "//share/"
1404+
then throwM $ InvalidPath
1405+
$ "the share root must be followed by a non-empty path"
1406+
else pure ()
1407+
else pure ()
1408+
1409+
if stemLen /= validStemLen -- "x/x>y"
1410+
then throwM $ InvalidPath
1411+
$ "Disallowed char found after "
1412+
++ show (rootLen + validStemLen)
1413+
++ " characters. The invalid char is: "
1414+
++ show (chr (fromIntegral invalidVal))
1415+
++ " [" ++ show invalidVal ++ "]"
1416+
else if invalidComponent -- "x/prn/y"
1417+
-- XXX print the invalid component name
1418+
then throwM $ InvalidPath $ "Disallowed Windows filename in path"
1419+
else pure ()
1420+
1421+
where
1422+
1423+
postDrive = snd $ Array.unsafeSplitAt 2 path
1424+
postDriveSep = countLeadingBy (isSeparatorWord Windows) postDrive
1425+
1426+
-- XXX check invalid chars in the path root as well - except . and '?'?
1427+
(root, stem) = splitRoot Windows path
1428+
rootLen = Array.length root
1429+
stemLen = Array.length stem
1430+
validStemLen = countLeadingValid Windows stem
1431+
invalidVal = fromIntegral (Array.unsafeGetIndex validStemLen stem) :: Word16
1432+
1433+
rootEndSeps = countTrailingBy (isSeparatorWord Windows) root
1434+
1435+
-- TBD: We are not currently validating the sharenames against disallowed
1436+
-- file names. Apparently windows does not allow even sharenames with those
1437+
-- names. To match against sharenames we will have to strip the separators
1438+
-- and drive etc from the root. Or we can use the parsing routines
1439+
-- themselves to validate.
1440+
toUp w16 =
1441+
if w16 < 256
1442+
then charToWord $ toUpper (wordToChar w16)
1443+
else w16
1444+
1445+
-- Should we strip all space chars as in Data.Char.isSpace?
1446+
isSpace x = x == charToWord ' '
1447+
1448+
-- XXX instead of using a list based check, pass the array to the checker.
1449+
-- We do not need to upcase the array, it can be done in the checker. Thus
1450+
-- we do not need to create a new array, the original slice can be checked.
1451+
getBaseName x =
1452+
runIdentity
1453+
$ Stream.toList
1454+
$ fmap toUp
1455+
$ Array.read
1456+
$ Array.strip isSpace
1457+
$ fst $ Array.breakEndBy_ (== extensionWord) x
1458+
1459+
components =
1460+
runIdentity
1461+
. Stream.toList
1462+
. fmap getBaseName
1463+
. splitCompact False Windows
1464+
1465+
invalidRootComponent =
1466+
List.any (`List.elem` isInvalidPathComponent) (components root)
1467+
invalidComponent =
1468+
List.any (`List.elem` isInvalidPathComponent) (components stem)
1469+
1470+
-- Note: We can use powershell for testing path validity.
1471+
-- "//share/x" works in powershell.
13621472
-- But mixed forward and backward slashes do not work, it is treated as a path
13631473
-- relative to current drive e.g. "\\/share/x" is treated as "C:/share/x".
13641474

1365-
-- XXX Throw exception pinpointing the failure reason.
1366-
13671475
-- | Check if the filepath is valid i.e. does the operating system allow such a
13681476
-- path in listing or creating files?
13691477
--
@@ -1486,63 +1594,10 @@ isInvalidPathComponent = fmap (fmap charToWord)
14861594
-- >>> isValidWin "\\\\??\\x"
14871595
-- True
14881596
isValid :: (Integral a, Unbox a) => OS -> Array a -> Bool
1489-
isValid Posix path
1490-
| Array.null path = False
1491-
| foldArr (Fold.elem 0) path = False
1492-
| otherwise = True
1493-
isValid Windows path
1494-
| Array.null path = False
1495-
| otherwise =
1496-
not (foldArr (Fold.any (isInvalidPathChar Windows)) stem)
1497-
&& not (List.any (`List.elem` isInvalidPathComponent) (components stem))
1498-
&& not (hasDrive path && postDriveSep > 1)
1499-
&& not (isAbsoluteUNC path
1500-
&& ( postDriveSep > 1
1501-
|| Array.length path == postDriveSep + 2
1502-
|| List.any (`List.elem` isInvalidPathComponent) (components root)
1503-
-- || not (hasTrailingSeparator Windows root)
1504-
|| rootEndSeps /= 1
1505-
|| Array.null stem
1506-
)
1507-
)
1508-
1509-
where
1510-
1511-
postDrive = snd $ Array.unsafeSplitAt 2 path
1512-
postDriveSep = countLeadingBy (isSeparatorWord Windows) postDrive
1513-
1514-
(root, stem) = splitRoot Windows path
1515-
rootEndSeps = countTrailingBy (isSeparatorWord Windows) root
1516-
1517-
-- TBD: We are not currently validating the sharenames against disallowed
1518-
-- file names. Apparently windows does not allow even sharenames with those
1519-
-- names. To match against sharenames we will have to strip the separators
1520-
-- and drive etc from the root. Or we can use the parsing routines
1521-
-- themselves to validate.
1522-
toUp w16 =
1523-
if w16 < 256
1524-
then charToWord $ toUpper (wordToChar w16)
1525-
else w16
1526-
1527-
-- Should we strip all space chars as in Data.Char.isSpace?
1528-
isSpace x = x == charToWord ' '
1529-
1530-
-- XXX instead of using a list based check, pass the array to the checker.
1531-
-- We do not need to upcase the array, it can be done in the checker. Thus
1532-
-- we do not need to create a new array, the original slice can be checked.
1533-
getBaseName x =
1534-
runIdentity
1535-
$ Stream.toList
1536-
$ fmap toUp
1537-
$ Array.read
1538-
$ Array.strip isSpace
1539-
$ fst $ Array.breakEndBy_ (== extensionWord) x
1540-
1541-
components =
1542-
runIdentity
1543-
. Stream.toList
1544-
. fmap getBaseName
1545-
. splitCompact False Windows
1597+
isValid os path =
1598+
case validatePath os path of
1599+
Nothing -> False
1600+
Just _ -> True
15461601

15471602
-- A chunk is essentially an untyped Array i.e. Array Word8. We can either use
15481603
-- the term ByteArray for that or just Chunk. The latter is shorter and we have
@@ -1553,17 +1608,8 @@ isValid Windows path
15531608
-- the definition of the 'Path' type. On Windows, the array passed must be a
15541609
-- multiple of 2 bytes as the underlying representation uses 'Word16'.
15551610
{-# INLINE unsafeFromChunk #-}
1556-
unsafeFromChunk ::
1557-
#ifdef DEBUG
1558-
Unbox a =>
1559-
#endif
1560-
Array Word8 -> Array a
1561-
unsafeFromChunk =
1562-
#ifndef DEBUG
1563-
Array.unsafeCast
1564-
#else
1565-
fromJust . fromChunk
1566-
#endif
1611+
unsafeFromChunk :: Array Word8 -> Array a
1612+
unsafeFromChunk = Array.unsafeCast
15671613

15681614
-- XXX Also check for invalid chars on windows.
15691615

@@ -1572,33 +1618,31 @@ unsafeFromChunk =
15721618
-- representation uses 'Word16'.
15731619
--
15741620
-- Throws 'InvalidPath'.
1575-
fromChunk :: (MonadThrow m, Unbox a) => Array Word8 -> m (Array a)
1576-
fromChunk arr =
1621+
fromChunk :: forall m a. (MonadThrow m, Unbox a, Integral a) =>
1622+
OS -> Array Word8 -> m (Array a)
1623+
fromChunk Posix arr =
1624+
let arr1 = Array.unsafeCast arr :: Array a
1625+
in validatePath Posix arr1 >> pure arr1
1626+
fromChunk Windows arr =
15771627
case Array.cast arr of
15781628
Nothing ->
1579-
-- XXX Windows only message.
15801629
throwM
15811630
$ InvalidPath
15821631
$ "Encoded path length " ++ show (Array.byteLength arr)
15831632
++ " is not a multiple of 16-bit."
1584-
Just x -> pure x
1633+
Just x -> validatePath Windows x >> pure x
15851634

15861635
-- | Convert 'Path' to an array of bytes.
15871636
toChunk :: Array a -> Array Word8
15881637
toChunk = Array.asBytes
15891638

15901639
unsafeFromChars :: (Unbox a) =>
1591-
(Char -> Bool)
1592-
-> (Stream Identity Char -> Stream Identity a)
1640+
(Stream Identity Char -> Stream Identity a)
15931641
-> Stream Identity Char
15941642
-> Array a
1595-
unsafeFromChars _p encode s =
1596-
#ifndef DEBUG
1643+
unsafeFromChars encode s =
15971644
let n = runIdentity $ Stream.fold Fold.length s
15981645
in Array.fromPureStreamN n (encode s)
1599-
#else
1600-
fromJust (fromChars _p encode s)
1601-
#endif
16021646

16031647
-- Note: We do not sanitize the path i.e. remove duplicate separators, .
16041648
-- segments, trailing separator etc because that would require unnecessary
@@ -1609,29 +1653,21 @@ unsafeFromChars _p encode s =
16091653
-- XXX Writing a custom fold for parsing a Posix path may be better for
16101654
-- efficient bulk parsing when needed. We need the same code to validate a
16111655
-- Chunk where we do not need to create an array.
1612-
fromChars :: (MonadThrow m, Unbox a) =>
1613-
(Char -> Bool)
1656+
fromChars :: (MonadThrow m, Unbox a, Integral a) =>
1657+
OS
16141658
-> (Stream Identity Char -> Stream Identity a)
16151659
-> Stream Identity Char
16161660
-> m (Array a)
1617-
fromChars p encode s =
1618-
-- XXX on windows terminate at first invalid char
1619-
let lengths = Fold.tee Fold.length (Fold.takeEndBy_ p Fold.length)
1620-
(n, n1) = runIdentity $ Stream.fold lengths s
1621-
arr = Array.fromPureStreamN n (encode s)
1622-
sample = Stream.takeWhile p s
1623-
in
1624-
if n <= 0
1625-
then throwM $ InvalidPath "Path cannot be empty."
1626-
else if n1 < n
1627-
then throwM $ InvalidPath $ "Path contains a NULL char at position: "
1628-
++ show n1 ++ " after " ++ runIdentity (Stream.toList sample)
1629-
else pure arr
1630-
1631-
toChars :: (Monad m, Unbox a) => (Stream m a -> Stream m Char) -> Array a -> Stream m Char
1661+
fromChars os encode s =
1662+
let arr = unsafeFromChars encode s
1663+
in fromChunk os (Array.unsafeCast arr)
1664+
1665+
toChars :: (Monad m, Unbox a) =>
1666+
(Stream m a -> Stream m Char) -> Array a -> Stream m Char
16321667
toChars decode arr = decode $ Array.read arr
16331668

1634-
toString :: Unbox a => (Stream Identity a -> Stream Identity Char) -> Array a -> [Char]
1669+
toString :: Unbox a =>
1670+
(Stream Identity a -> Stream Identity Char) -> Array a -> [Char]
16351671
toString decode = runIdentity . Stream.toList . toChars decode
16361672

16371673
------------------------------------------------------------------------------
@@ -1697,6 +1733,8 @@ unsafeAppend os toStr a b =
16971733
-- need a joinRoot.
16981734
--
16991735
-- XXX Also, we cannot append "/" to "c:/" as it will make the path invalid.
1736+
-- XXX On Windows a path starting with / is not absolute and can be appended to
1737+
-- a path/drive ending with :.
17001738

17011739
-- | Note that append joins two paths using a separator between the paths.
17021740
-- Using append to join a root with a path segment can change the meaning of
@@ -1791,6 +1829,14 @@ eqWindowsAbsRootStrict a b =
17911829
(fmap toDefaultSeparator $ Array.read a)
17921830
(fmap toDefaultSeparator $ Array.read b)
17931831

1832+
-- XXX Use options in the same eqPath routine instead of having different
1833+
-- routines. On posix even macos can have case insensitive comparison.
1834+
-- ALLOW_RELATIVE_PATH_EQUALITY, IGNORE_TRAILING_SEPARATOR,
1835+
-- IGNORE_CASE.
1836+
--
1837+
-- The following options can be added later: PROCESS_PARENT_REFS,
1838+
-- DONT_IGNORE_REDUNDANT_SEPARATORS, DONT_IGNORE_DOT_COMPONENTS.
1839+
17941840
-- | Checks two paths for logical equality. It performs some normalizations on
17951841
-- the paths before comparing them, specifically it drops redundant path
17961842
-- separators between path segments and redundant "/./" components between
@@ -1961,6 +2007,7 @@ eqWindowsComponents a b =
19612007
-- strict equality routine.
19622008
--
19632009
-- * A leading dot is ignored, thus "./x == ./x" and "./x == x".
2010+
-- * On Windows leading non-root drive prefix is ignored "C:x == C:x"
19642011
-- * A trailing separator is ignored thus "x/ == x".
19652012
-- * On Windows the comparison is case insensitive thus "X == x".
19662013
--

0 commit comments

Comments
 (0)